diff --git a/3rdparty/tinyexr/tinyexr.h b/3rdparty/tinyexr/tinyexr.h
index 75f8839..850cc4c 100644
--- a/3rdparty/tinyexr/tinyexr.h
+++ b/3rdparty/tinyexr/tinyexr.h
@@ -124,7 +124,8 @@ extern "C" {
 #define TINYEXR_PIXELTYPE_HALF (1)
 #define TINYEXR_PIXELTYPE_FLOAT (2)
 
-#define TINYEXR_MAX_ATTRIBUTES (128)
+#define TINYEXR_MAX_HEADER_ATTRIBUTES (1024)
+#define TINYEXR_MAX_CUSTOM_ATTRIBUTES (128)
 
 #define TINYEXR_COMPRESSIONTYPE_NONE (0)
 #define TINYEXR_COMPRESSIONTYPE_RLE (1)
@@ -206,7 +207,8 @@ typedef struct _EXRHeader {
   // Custom attributes(exludes required attributes(e.g. `channels`,
   // `compression`, etc)
   int num_custom_attributes;
-  EXRAttribute custom_attributes[TINYEXR_MAX_ATTRIBUTES];
+  EXRAttribute *custom_attributes;  // array of EXRAttribute. size =
+                                    // `num_custom_attributes`.
 
   EXRChannelInfo *channels;  // [num_channels]
 
@@ -6939,6 +6941,14 @@ void *mz_zip_extract_archive_file_to_heap(const char *pZip_filename,
 
 static const int kEXRVersionSize = 8;
 
+static void cpy2(unsigned short *dst_val, const unsigned short *src_val) {
+  unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
+  const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
+  // Copy 2 bytes via unsigned char pointers; callers may pass unaligned data.
+  dst[0] = src[0];
+  dst[1] = src[1];
+}
+
 static void swap2(unsigned short *val) {
 #ifdef MINIZ_LITTLE_ENDIAN
   (void)val;
@@ -6952,6 +6962,36 @@ static void swap2(unsigned short *val) {
 #endif
 }
 
+static void cpy4(int *dst_val, const int *src_val) {
+  unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
+  const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
+  // Copy 4 bytes one at a time through unsigned char pointers (int overload).
+  dst[0] = src[0];
+  dst[1] = src[1];
+  dst[2] = src[2];
+  dst[3] = src[3];
+}
+
+static void cpy4(unsigned int *dst_val, const unsigned int *src_val) {
+  unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
+  const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
+  // Copy 4 bytes one at a time through unsigned char pointers (uint overload).
+  dst[0] = src[0];
+  dst[1] = src[1];
+  dst[2] = src[2];
+  dst[3] = src[3];
+}
+
+static void cpy4(float *dst_val, const float *src_val) {
+  unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
+  const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
+  // Copy the 4 raw bytes of the float as-is; no numeric conversion happens.
+  dst[0] = src[0];
+  dst[1] = src[1];
+  dst[2] = src[2];
+  dst[3] = src[3];
+}
+
 static void swap4(unsigned int *val) {
 #ifdef MINIZ_LITTLE_ENDIAN
   (void)val;
@@ -6967,6 +7007,22 @@ static void swap4(unsigned int *val) {
 #endif
 }
 
+#if 0
+static void cpy8(tinyexr::tinyexr_uint64 *dst_val, const tinyexr::tinyexr_uint64 *src_val) {
+  unsigned char *dst = reinterpret_cast<unsigned char *>(dst_val);
+  const unsigned char *src = reinterpret_cast<const unsigned char *>(src_val);
+  // Byte-wise 8-byte copy. Currently disabled by the enclosing #if 0.
+  dst[0] = src[0];
+  dst[1] = src[1];
+  dst[2] = src[2];
+  dst[3] = src[3];
+  dst[4] = src[4];
+  dst[5] = src[5];
+  dst[6] = src[6];
+  dst[7] = src[7];
+}
+#endif
+
 static void swap8(tinyexr::tinyexr_uint64 *val) {
 #ifdef MINIZ_LITTLE_ENDIAN
   (void)val;
@@ -8215,8 +8271,8 @@ static void hufBuildEncTable(
   //    for all array entries.
   //
 
-  int hlink[HUF_ENCSIZE];
-  long long *fHeap[HUF_ENCSIZE];
+  std::vector<int> hlink(HUF_ENCSIZE);
+  std::vector<long long *> fHeap(HUF_ENCSIZE);
 
   *im = 0;
 
@@ -8275,8 +8331,8 @@ static void hufBuildEncTable(
 
   std::make_heap(&fHeap[0], &fHeap[nf], FHeapCompare());
 
-  long long scode[HUF_ENCSIZE];
-  memset(scode, 0, sizeof(long long) * HUF_ENCSIZE);
+  std::vector<long long> scode(HUF_ENCSIZE);
+  memset(scode.data(), 0, sizeof(long long) * HUF_ENCSIZE);
 
   while (nf > 1) {
     //
@@ -8348,8 +8404,8 @@ static void hufBuildEncTable(
   // code table from scode into frq.
   //
 
-  hufCanonicalCodeTable(scode);
-  memcpy(frq, scode, sizeof(long long) * HUF_ENCSIZE);
+  hufCanonicalCodeTable(scode.data());
+  memcpy(frq, scode.data(), sizeof(long long) * HUF_ENCSIZE);
 }
 
 //
@@ -8813,7 +8869,7 @@ static bool hufDecode(const long long *hcode,  // i : encoding table
   return true;
 }
 
-static void countFrequencies(long long freq[HUF_ENCSIZE],
+static void countFrequencies(std::vector<long long> &freq,
                              const unsigned short data[/*n*/], int n) {
   for (int i = 0; i < HUF_ENCSIZE; ++i) freq[i] = 0;
 
@@ -8844,21 +8900,21 @@ static int hufCompress(const unsigned short raw[], int nRaw,
                        char compressed[]) {
   if (nRaw == 0) return 0;
 
-  long long freq[HUF_ENCSIZE];
+  std::vector<long long> freq(HUF_ENCSIZE);
 
   countFrequencies(freq, raw, nRaw);
 
   int im = 0;
   int iM = 0;
-  hufBuildEncTable(freq, &im, &iM);
+  hufBuildEncTable(freq.data(), &im, &iM);
 
   char *tableStart = compressed + 20;
   char *tableEnd = tableStart;
-  hufPackEncTable(freq, im, iM, &tableEnd);
+  hufPackEncTable(freq.data(), im, iM, &tableEnd);
   int tableLength = tableEnd - tableStart;
 
   char *dataStart = tableEnd;
-  int nBits = hufEncode(freq, raw, nRaw, iM, dataStart);
+  int nBits = hufEncode(freq.data(), raw, nRaw, iM, dataStart);
   int data_length = (nBits + 7) / 8;
 
   writeUInt(compressed, im);
@@ -9003,7 +9059,7 @@ static bool CompressPiz(unsigned char *outPtr, unsigned int *outSize,
                         const unsigned char *inPtr, size_t inSize,
                         const std::vector<ChannelInfo> &channelInfo,
                         int data_width, int num_lines) {
-  unsigned char bitmap[BITMAP_SIZE];
+  std::vector<unsigned char> bitmap(BITMAP_SIZE);
   unsigned short minNonZero;
   unsigned short maxNonZero;
 
@@ -9054,12 +9110,12 @@ static bool CompressPiz(unsigned char *outPtr, unsigned int *outSize,
     }
   }
 
-  bitmapFromData(&tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()), bitmap,
-                 minNonZero, maxNonZero);
+  bitmapFromData(&tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()),
+                 bitmap.data(), minNonZero, maxNonZero);
 
-  unsigned short lut[USHORT_RANGE];
-  unsigned short maxValue = forwardLutFromBitmap(bitmap, lut);
-  applyLut(lut, &tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()));
+  std::vector<unsigned short> lut(USHORT_RANGE);
+  unsigned short maxValue = forwardLutFromBitmap(bitmap.data(), lut.data());
+  applyLut(lut.data(), &tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()));
 
   //
   // Store range compression info in _outBuffer
@@ -9129,7 +9185,7 @@ static bool DecompressPiz(unsigned char *outPtr, const unsigned char *inPtr,
     return true;
   }
 
-  unsigned char bitmap[BITMAP_SIZE];
+  std::vector<unsigned char> bitmap(BITMAP_SIZE);
   unsigned short minNonZero;
   unsigned short maxNonZero;
 
@@ -9139,11 +9195,13 @@ static bool DecompressPiz(unsigned char *outPtr, const unsigned char *inPtr,
   return false;
 #endif
 
-  memset(bitmap, 0, BITMAP_SIZE);
+  memset(bitmap.data(), 0, BITMAP_SIZE);
 
   const unsigned char *ptr = inPtr;
-  minNonZero = *(reinterpret_cast<const unsigned short *>(ptr));
-  maxNonZero = *(reinterpret_cast<const unsigned short *>(ptr + 2));
+  //minNonZero = *(reinterpret_cast<const unsigned short *>(ptr));
+  tinyexr::cpy2(&minNonZero, reinterpret_cast<const unsigned short *>(ptr));
+  //maxNonZero = *(reinterpret_cast<const unsigned short *>(ptr + 2));
+  tinyexr::cpy2(&maxNonZero, reinterpret_cast<const unsigned short *>(ptr + 2));
   ptr += 4;
 
   if (maxNonZero >= BITMAP_SIZE) {
@@ -9156,9 +9214,9 @@ static bool DecompressPiz(unsigned char *outPtr, const unsigned char *inPtr,
     ptr += maxNonZero - minNonZero + 1;
   }
 
-  unsigned short lut[USHORT_RANGE];
-  memset(lut, 0, sizeof(unsigned short) * USHORT_RANGE);
-  unsigned short maxValue = reverseLutFromBitmap(bitmap, lut);
+  std::vector<unsigned short> lut(USHORT_RANGE);
+  memset(lut.data(), 0, sizeof(unsigned short) * USHORT_RANGE);
+  unsigned short maxValue = reverseLutFromBitmap(bitmap.data(), lut.data());
 
   //
   // Huffman decoding
@@ -9166,7 +9224,8 @@ static bool DecompressPiz(unsigned char *outPtr, const unsigned char *inPtr,
 
   int length;
 
-  length = *(reinterpret_cast<const int *>(ptr));
+  //length = *(reinterpret_cast<const int *>(ptr));
+  tinyexr::cpy4(&length, reinterpret_cast<const int *>(ptr));
   ptr += sizeof(int);
 
   std::vector<unsigned short> tmpBuffer(tmpBufSize);
@@ -9212,7 +9271,7 @@ static bool DecompressPiz(unsigned char *outPtr, const unsigned char *inPtr,
   // Expand the pixel data to their original range
   //
 
-  applyLut(lut, &tmpBuffer.at(0), static_cast<int>(tmpBufSize));
+  applyLut(lut.data(), &tmpBuffer.at(0), static_cast<int>(tmpBufSize));
 
   for (int y = 0; y < num_lines; y++) {
     for (size_t i = 0; i < channelData.size(); ++i) {
@@ -9480,7 +9539,8 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
           for (size_t u = 0; u < static_cast<size_t>(width); u++) {
             FP16 hf;
 
-            hf.u = line_ptr[u];
+            // hf.u = line_ptr[u];
+            tinyexr::cpy2(&(hf.u), line_ptr + u);
 
             tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
 
@@ -9523,7 +9583,9 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
               &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
                          channel_offset_list[c] * static_cast<size_t>(width)));
           for (size_t u = 0; u < static_cast<size_t>(width); u++) {
-            unsigned int val = line_ptr[u];
+            unsigned int val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
 
             tinyexr::swap4(&val);
 
@@ -9549,7 +9611,9 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
               v * pixel_data_size * static_cast<size_t>(x_stride) +
               channel_offset_list[c] * static_cast<size_t>(x_stride)));
           for (size_t u = 0; u < static_cast<size_t>(width); u++) {
-            float val = line_ptr[u];
+            float val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
 
             tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
 
@@ -9611,7 +9675,8 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
           for (size_t u = 0; u < static_cast<size_t>(width); u++) {
             tinyexr::FP16 hf;
 
-            hf.u = line_ptr[u];
+            // hf.u = line_ptr[u];
+            tinyexr::cpy2(&(hf.u), line_ptr + u);
 
             tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
 
@@ -9654,7 +9719,9 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
               &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
                          channel_offset_list[c] * static_cast<size_t>(width)));
           for (size_t u = 0; u < static_cast<size_t>(width); u++) {
-            unsigned int val = line_ptr[u];
+            unsigned int val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
 
             tinyexr::swap4(&val);
 
@@ -9680,7 +9747,9 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
               &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
                          channel_offset_list[c] * static_cast<size_t>(width)));
           for (size_t u = 0; u < static_cast<size_t>(width); u++) {
-            float val = line_ptr[u];
+            float val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
 
             tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
 
@@ -9735,7 +9804,8 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
           for (size_t u = 0; u < static_cast<size_t>(width); u++) {
             tinyexr::FP16 hf;
 
-            hf.u = line_ptr[u];
+            // hf.u = line_ptr[u];
+            tinyexr::cpy2(&(hf.u), line_ptr + u);
 
             tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
 
@@ -9778,7 +9848,9 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
               &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
                          channel_offset_list[c] * static_cast<size_t>(width)));
           for (size_t u = 0; u < static_cast<size_t>(width); u++) {
-            unsigned int val = line_ptr[u];
+            unsigned int val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
 
             tinyexr::swap4(&val);
 
@@ -9804,7 +9876,9 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
               &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
                          channel_offset_list[c] * static_cast<size_t>(width)));
           for (size_t u = 0; u < static_cast<size_t>(width); u++) {
-            float val = line_ptr[u];
+            float val;
+            // val = line_ptr[u];
+            tinyexr::cpy4(&val, line_ptr + u);
 
             tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
 
@@ -9867,7 +9941,8 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
               &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
                          channel_offset_list[c] * static_cast<size_t>(width)));
           for (size_t u = 0; u < static_cast<size_t>(width); u++) {
-            float val = line_ptr[u];
+            float val;
+            tinyexr::cpy4(&val, line_ptr + u);
 
             tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
 
@@ -9917,7 +9992,8 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
           for (int u = 0; u < width; u++) {
             tinyexr::FP16 hf;
 
-            hf.u = line_ptr[u];
+            // hf.u = line_ptr[u];
+            tinyexr::cpy2(&(hf.u), line_ptr + u);
 
             tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
 
@@ -9934,7 +10010,9 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
           for (int u = 0; u < width; u++) {
             tinyexr::FP16 hf;
 
-            hf.u = line_ptr[u];
+            // address may not be aligned. use byte-wise copy for safety. #76
+            // hf.u = line_ptr[u];
+            tinyexr::cpy2(&(hf.u), line_ptr + u);
 
             tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
 
@@ -9958,7 +10036,8 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
         }
 
         for (int u = 0; u < width; u++) {
-          float val = line_ptr[u];
+          float val;
+          tinyexr::cpy4(&val, line_ptr + u);
 
           tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
 
@@ -9976,7 +10055,8 @@ static bool DecodePixelData(/* out */ unsigned char **out_images,
         }
 
         for (int u = 0; u < width; u++) {
-          unsigned int val = line_ptr[u];
+          unsigned int val;
+          tinyexr::cpy4(&val, line_ptr + u);
 
           tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
 
@@ -10153,7 +10233,7 @@ static int ParseEXRHeader(HeaderInfo *info, bool *empty_header,
 
   // Read attributes
   size_t orig_size = size;
-  for (;;) {
+  for (size_t nattr = 0; nattr < TINYEXR_MAX_HEADER_ATTRIBUTES; nattr++) {
     if (0 == size) {
       return TINYEXR_ERROR_INVALID_DATA;
     } else if (marker[0] == '\0') {
@@ -10316,8 +10396,8 @@ static int ParseEXRHeader(HeaderInfo *info, bool *empty_header,
         tinyexr::swap4(reinterpret_cast<unsigned int *>(&info->chunk_count));
       }
     } else {
-      // Custom attribute(up to TINYEXR_MAX_ATTRIBUTES)
-      if (info->attributes.size() < TINYEXR_MAX_ATTRIBUTES) {
+      // Custom attribute(up to TINYEXR_MAX_CUSTOM_ATTRIBUTES)
+      if (info->attributes.size() < TINYEXR_MAX_CUSTOM_ATTRIBUTES) {
         EXRAttribute attrib;
 #ifdef _MSC_VER
         strncpy_s(attrib.name, attr_name.c_str(), 255);
@@ -10447,15 +10527,30 @@ static void ConvertHeader(EXRHeader *exr_header, const HeaderInfo &info) {
     exr_header->requested_pixel_types[c] = info.channels[c].pixel_type;
   }
 
-  assert(info.attributes.size() < TINYEXR_MAX_ATTRIBUTES);
   exr_header->num_custom_attributes = static_cast<int>(info.attributes.size());
 
-  for (size_t i = 0; i < info.attributes.size(); i++) {
-    memcpy(exr_header->custom_attributes[i].name, info.attributes[i].name, 256);
-    memcpy(exr_header->custom_attributes[i].type, info.attributes[i].type, 256);
-    exr_header->custom_attributes[i].size = info.attributes[i].size;
-    // Just copy poiner
-    exr_header->custom_attributes[i].value = info.attributes[i].value;
+  if (exr_header->num_custom_attributes > 0) {
+    // TODO(syoyo): Report warning when # of attributes exceeds
+    // `TINYEXR_MAX_CUSTOM_ATTRIBUTES`
+    if (exr_header->num_custom_attributes > TINYEXR_MAX_CUSTOM_ATTRIBUTES) {
+      exr_header->num_custom_attributes = TINYEXR_MAX_CUSTOM_ATTRIBUTES;
+    }
+    // Heap-allocate exactly `num_custom_attributes` (possibly clamped) entries.
+    exr_header->custom_attributes = static_cast<EXRAttribute *>(malloc(
+        sizeof(EXRAttribute) * size_t(exr_header->num_custom_attributes)));
+    // Copy only the clamped count so the allocation cannot be overrun.
+    for (size_t i = 0; i < size_t(exr_header->num_custom_attributes); i++) {
+      memcpy(exr_header->custom_attributes[i].name, info.attributes[i].name,
+             256);
+      memcpy(exr_header->custom_attributes[i].type, info.attributes[i].type,
+             256);
+      exr_header->custom_attributes[i].size = info.attributes[i].size;
+      // Just copy the pointer; the value buffer itself is not duplicated.
+      exr_header->custom_attributes[i].value = info.attributes[i].value;
+    }
+
+  } else {
+    exr_header->custom_attributes = NULL;
   }
 
   exr_header->header_len = info.header_len;
@@ -11458,7 +11553,8 @@ size_t SaveEXRImageToMemory(const EXRImage *exr_image,
                   static_cast<size_t>(pixel_data_size * y * exr_image->width) +
                   channel_offset_list[c] *
                       static_cast<size_t>(exr_image->width)));
-              line_ptr[x] = f32.f;
+              // line_ptr[x] = f32.f;
+              tinyexr::cpy4(line_ptr + x, &(f32.f));
             }
           }
         } else if (exr_header->requested_pixel_types[c] ==
@@ -11476,7 +11572,8 @@ size_t SaveEXRImageToMemory(const EXRImage *exr_image,
                                               exr_image->width) +
                           channel_offset_list[c] *
                               static_cast<size_t>(exr_image->width)));
-              line_ptr[x] = val;
+              // line_ptr[x] = val;
+              tinyexr::cpy2(line_ptr + x, &val);
             }
           }
         } else {
@@ -11502,7 +11599,8 @@ size_t SaveEXRImageToMemory(const EXRImage *exr_image,
                                               exr_image->width) +
                           channel_offset_list[c] *
                               static_cast<size_t>(exr_image->width)));
-              line_ptr[x] = h16.u;
+              // line_ptr[x] = h16.u;
+              tinyexr::cpy2(line_ptr + x, &(h16.u));
             }
           }
         } else if (exr_header->requested_pixel_types[c] ==
@@ -11519,7 +11617,8 @@ size_t SaveEXRImageToMemory(const EXRImage *exr_image,
                   static_cast<size_t>(pixel_data_size * y * exr_image->width) +
                   channel_offset_list[c] *
                       static_cast<size_t>(exr_image->width)));
-              line_ptr[x] = val;
+              // line_ptr[x] = val;
+              tinyexr::cpy4(line_ptr + x, &val);
             }
           }
         } else {
@@ -11538,7 +11637,8 @@ size_t SaveEXRImageToMemory(const EXRImage *exr_image,
                 static_cast<size_t>(pixel_data_size * y * exr_image->width) +
                 channel_offset_list[c] *
                     static_cast<size_t>(exr_image->width)));
-            line_ptr[x] = val;
+            // line_ptr[x] = val;
+            tinyexr::cpy4(line_ptr + x, &val);
           }
         }
       }
@@ -11768,7 +11868,7 @@ int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
 #ifdef _MSC_VER
   FILE *fp = NULL;
   errno_t errcode = fopen_s(&fp, filename, "rb");
-  if ((!errcode) || (!fp)) {
+  if ((0 != errcode) || (!fp)) {
     if (err) {
       (*err) = "Cannot read file.";
     }
@@ -12103,8 +12203,10 @@ int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
 
         if (channels[c].pixel_type == 0) {  // UINT
           for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
-            unsigned int ui = *reinterpret_cast<unsigned int *>(
+            unsigned int ui;
+            unsigned int *src_ptr = reinterpret_cast<unsigned int *>(
                 &sample_data.at(size_t(data_offset) + x * sizeof(int)));
+            tinyexr::cpy4(&ui, src_ptr);
             deep_image->image[c][y][x] = static_cast<float>(ui);  // @fixme
           }
           data_offset +=
@@ -12112,16 +12214,19 @@ int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
         } else if (channels[c].pixel_type == 1) {  // half
           for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
             tinyexr::FP16 f16;
-            f16.u = *reinterpret_cast<unsigned short *>(
+            const unsigned short *src_ptr = reinterpret_cast<unsigned short *>(
                 &sample_data.at(size_t(data_offset) + x * sizeof(short)));
+            tinyexr::cpy2(&(f16.u), src_ptr);
             tinyexr::FP32 f32 = half_to_float(f16);
             deep_image->image[c][y][x] = f32.f;
           }
           data_offset += sizeof(short) * static_cast<size_t>(samples_per_line);
         } else {  // float
           for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
-            float f = *reinterpret_cast<float *>(
+            float f;
+            const float *src_ptr = reinterpret_cast<float *>(
                 &sample_data.at(size_t(data_offset) + x * sizeof(float)));
+            tinyexr::cpy4(&f, src_ptr);
             deep_image->image[c][y][x] = f;
           }
           data_offset += sizeof(float) * static_cast<size_t>(samples_per_line);
@@ -12193,6 +12298,10 @@ int FreeEXRHeader(EXRHeader *exr_header) {
     }
   }
 
+  if (exr_header->custom_attributes) {
+    free(exr_header->custom_attributes);
+  }
+
   return TINYEXR_SUCCESS;
 }
 
@@ -12222,6 +12331,7 @@ int FreeEXRImage(EXRImage *exr_image) {
         free(exr_image->tiles[tid].images);
       }
     }
+    free(exr_image->tiles);
   }
 
   return TINYEXR_SUCCESS;
diff --git a/include/bimg/bimg.h b/include/bimg/bimg.h
index 8122b9e..8f62b74 100644
--- a/include/bimg/bimg.h
+++ b/include/bimg/bimg.h
@@ -9,7 +9,7 @@
 #include <stdint.h> // uint32_t
 #include <stdlib.h> // NULL
 
-#define BIMG_API_VERSION UINT32_C(5)
+#define BIMG_API_VERSION UINT32_C(6)
 
 namespace bx
 {
@@ -404,7 +404,8 @@ namespace bimg
 
 	///
 	bool imageConvert(
-		  void* _dst
+		  bx::AllocatorI* _allocator
+		, void* _dst
 		, TextureFormat::Enum _dstFormat
 		, const void* _src
 		, TextureFormat::Enum _srcFormat
@@ -426,6 +427,7 @@ namespace bimg
 		  bx::AllocatorI* _allocator
 		, TextureFormat::Enum _dstFormat
 		, const ImageContainer& _input
+		, bool _convertMips = true
 		);
 
 	///
@@ -575,7 +577,8 @@ namespace bimg
 
 	///
 	void imageDecodeToBgra8(
-		  void* _dst
+		  bx::AllocatorI* _allocator
+		, void* _dst
 		, const void* _src
 		, uint32_t _width
 		, uint32_t _height
@@ -585,7 +588,8 @@ namespace bimg
 
 	///
 	void imageDecodeToRgba8(
-		  void* _dst
+		  bx::AllocatorI* _allocator
+		, void* _dst
 		, const void* _src
 		, uint32_t _width
 		, uint32_t _height
diff --git a/include/bimg/encode.h b/include/bimg/encode.h
index bab8de7..b508745 100644
--- a/include/bimg/encode.h
+++ b/include/bimg/encode.h
@@ -24,7 +24,8 @@ namespace bimg
 
 	///
 	void imageEncodeFromRgba8(
-		  void* _dst
+		  bx::AllocatorI* _allocator
+		, void* _dst
 		, const void* _src
 		, uint32_t _width
 		, uint32_t _height
diff --git a/src/image.cpp b/src/image.cpp
index 29a7236..3afcba4 100644
--- a/src/image.cpp
+++ b/src/image.cpp
@@ -969,13 +969,31 @@ namespace bimg
 		}
 	}
 
-	bool imageConvert(void* _dst, TextureFormat::Enum _dstFormat, const void* _src, TextureFormat::Enum _srcFormat, uint32_t _width, uint32_t _height, uint32_t _depth, uint32_t _srcPitch)
+	bool imageConvert(bx::AllocatorI* _allocator, void* _dst, TextureFormat::Enum _dstFormat, const void* _src, TextureFormat::Enum _srcFormat, uint32_t _width, uint32_t _height, uint32_t _depth, uint32_t _srcPitch)
 	{
 		UnpackFn unpack = s_packUnpack[_srcFormat].unpack;
 		PackFn   pack   = s_packUnpack[_dstFormat].pack;
 		if (NULL == pack
 		||  NULL == unpack)
 		{
+			switch (_dstFormat)
+			{
+			case TextureFormat::RGBA8:
+				imageDecodeToRgba8(_allocator, _dst, _src, _width, _height, _width*4, _srcFormat);
+				return true;
+
+			case TextureFormat::BGRA8:
+				imageDecodeToBgra8(_allocator, _dst, _src, _width, _height, _width*4, _srcFormat);
+				return true;
+
+			case TextureFormat::RGBA32F:
+				imageDecodeToRgba32f(_allocator, _dst, _src, _width, _height, 1, _width*16, _srcFormat);
+				return true;
+
+			default:
+				break;
+			}
+
 			return false;
 		}
 
@@ -986,7 +1004,7 @@ namespace bimg
 		return true;
 	}
 
-	bool imageConvert(void* _dst, TextureFormat::Enum _dstFormat, const void* _src, TextureFormat::Enum _srcFormat, uint32_t _width, uint32_t _height, uint32_t _depth)
+	bool imageConvert(bx::AllocatorI* _allocator, void* _dst, TextureFormat::Enum _dstFormat, const void* _src, TextureFormat::Enum _srcFormat, uint32_t _width, uint32_t _height, uint32_t _depth)
 	{
 		const uint32_t srcBpp = s_imageBlockInfo[_srcFormat].bitsPerPixel;
 
@@ -996,10 +1014,10 @@ namespace bimg
 			return true;
 		}
 
-		return imageConvert(_dst, _dstFormat, _src, _srcFormat, _width, _height, _depth, _width*srcBpp/8);
+		return imageConvert(_allocator, _dst, _dstFormat, _src, _srcFormat, _width, _height, _depth, _width*srcBpp/8);
 	}
 
-	ImageContainer* imageConvert(bx::AllocatorI* _allocator, TextureFormat::Enum _dstFormat, const ImageContainer& _input)
+	ImageContainer* imageConvert(bx::AllocatorI* _allocator, TextureFormat::Enum _dstFormat, const ImageContainer& _input, bool _convertMips)
 	{
 		ImageContainer* output = imageAlloc(_allocator
 			, _dstFormat
@@ -1008,14 +1026,14 @@ namespace bimg
 			, uint16_t(_input.m_depth)
 			, _input.m_numLayers
 			, _input.m_cubeMap
-			, 1 < _input.m_numMips
+			, _convertMips && 1 < _input.m_numMips
 			);
 
 		const uint16_t numSides = _input.m_numLayers * (_input.m_cubeMap ? 6 : 1);
 
 		for (uint16_t side = 0; side < numSides; ++side)
 		{
-			for (uint8_t lod = 0, num = _input.m_numMips; lod < num; ++lod)
+			for (uint8_t lod = 0, num = _convertMips ? _input.m_numMips : 1; lod < num; ++lod)
 			{
 				ImageMip mip;
 				if (imageGetRawData(_input, side, lod, _input.m_data, _input.m_size, mip) )
@@ -1024,14 +1042,16 @@ namespace bimg
 					imageGetRawData(*output, side, lod, output->m_data, output->m_size, dstMip);
 					uint8_t* dstData = const_cast<uint8_t*>(dstMip.m_data);
 
-					bool ok = imageConvert(dstData
-							, _dstFormat
-							, mip.m_data
-							, mip.m_format
-							, mip.m_width
-							, mip.m_height
-							, mip.m_depth
-							);
+					bool ok = imageConvert(
+						  _allocator
+						, dstData
+						, _dstFormat
+						, mip.m_data
+						, mip.m_format
+						, mip.m_width
+						, mip.m_height
+						, mip.m_depth
+						);
 					BX_CHECK(ok, "Conversion from %s to %s failed!"
 							, getName(_input.m_format)
 							, getName(output->m_format)
@@ -1240,19 +1260,997 @@ namespace bimg
 		}
 	}
 
-	static const int32_t s_etc1Mod[8][4] =
-	{
-		{  2,   8,  -2,   -8},
-		{  5,  17,  -5,  -17},
-		{  9,  29,  -9,  -29},
-		{ 13,  42, -13,  -42},
-		{ 18,  60, -18,  -60},
-		{ 24,  80, -24,  -80},
-		{ 33, 106, -33, -106},
-		{ 47, 183, -47, -183},
+	// BC6H, BC7
+	//
+	// Reference:
+	//
+	// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_bptc.txt
+	// https://msdn.microsoft.com/en-us/library/windows/desktop/hh308952(v=vs.85).aspx
+
+	static const uint16_t s_bptcP2[] = // 2-subset partitions: bit (y*4+x) of entry P is that texel's subset (0 or 1)
+	{ //  3210     0000000000   1111111111   2222222222   3333333333
+		0xcccc, // 0, 0, 1, 1,  0, 0, 1, 1,  0, 0, 1, 1,  0, 0, 1, 1
+		0x8888, // 0, 0, 0, 1,  0, 0, 0, 1,  0, 0, 0, 1,  0, 0, 0, 1
+		0xeeee, // 0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1
+		0xecc8, // 0, 0, 0, 1,  0, 0, 1, 1,  0, 0, 1, 1,  0, 1, 1, 1
+		0xc880, // 0, 0, 0, 0,  0, 0, 0, 1,  0, 0, 0, 1,  0, 0, 1, 1
+		0xfeec, // 0, 0, 1, 1,  0, 1, 1, 1,  0, 1, 1, 1,  1, 1, 1, 1
+		0xfec8, // 0, 0, 0, 1,  0, 0, 1, 1,  0, 1, 1, 1,  1, 1, 1, 1
+		0xec80, // 0, 0, 0, 0,  0, 0, 0, 1,  0, 0, 1, 1,  0, 1, 1, 1
+		0xc800, // 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 1,  0, 0, 1, 1
+		0xffec, // 0, 0, 1, 1,  0, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1
+		0xfe80, // 0, 0, 0, 0,  0, 0, 0, 1,  0, 1, 1, 1,  1, 1, 1, 1
+		0xe800, // 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 1,  0, 1, 1, 1
+		0xffe8, // 0, 0, 0, 1,  0, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1
+		0xff00, // 0, 0, 0, 0,  0, 0, 0, 0,  1, 1, 1, 1,  1, 1, 1, 1
+		0xfff0, // 0, 0, 0, 0,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1
+		0xf000, // 0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  1, 1, 1, 1
+		0xf710, // 0, 0, 0, 0,  1, 0, 0, 0,  1, 1, 1, 0,  1, 1, 1, 1
+		0x008e, // 0, 1, 1, 1,  0, 0, 0, 1,  0, 0, 0, 0,  0, 0, 0, 0
+		0x7100, // 0, 0, 0, 0,  0, 0, 0, 0,  1, 0, 0, 0,  1, 1, 1, 0
+		0x08ce, // 0, 1, 1, 1,  0, 0, 1, 1,  0, 0, 0, 1,  0, 0, 0, 0
+		0x008c, // 0, 0, 1, 1,  0, 0, 0, 1,  0, 0, 0, 0,  0, 0, 0, 0
+		0x7310, // 0, 0, 0, 0,  1, 0, 0, 0,  1, 1, 0, 0,  1, 1, 1, 0
+		0x3100, // 0, 0, 0, 0,  0, 0, 0, 0,  1, 0, 0, 0,  1, 1, 0, 0
+		0x8cce, // 0, 1, 1, 1,  0, 0, 1, 1,  0, 0, 1, 1,  0, 0, 0, 1
+		0x088c, // 0, 0, 1, 1,  0, 0, 0, 1,  0, 0, 0, 1,  0, 0, 0, 0
+		0x3110, // 0, 0, 0, 0,  1, 0, 0, 0,  1, 0, 0, 0,  1, 1, 0, 0
+		0x6666, // 0, 1, 1, 0,  0, 1, 1, 0,  0, 1, 1, 0,  0, 1, 1, 0
+		0x366c, // 0, 0, 1, 1,  0, 1, 1, 0,  0, 1, 1, 0,  1, 1, 0, 0
+		0x17e8, // 0, 0, 0, 1,  0, 1, 1, 1,  1, 1, 1, 0,  1, 0, 0, 0
+		0x0ff0, // 0, 0, 0, 0,  1, 1, 1, 1,  1, 1, 1, 1,  0, 0, 0, 0
+		0x718e, // 0, 1, 1, 1,  0, 0, 0, 1,  1, 0, 0, 0,  1, 1, 1, 0
+		0x399c, // 0, 0, 1, 1,  1, 0, 0, 1,  1, 0, 0, 1,  1, 1, 0, 0
+		0xaaaa, // 0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1,  0, 1, 0, 1
+		0xf0f0, // 0, 0, 0, 0,  1, 1, 1, 1,  0, 0, 0, 0,  1, 1, 1, 1
+		0x5a5a, // 0, 1, 0, 1,  1, 0, 1, 0,  0, 1, 0, 1,  1, 0, 1, 0
+		0x33cc, // 0, 0, 1, 1,  0, 0, 1, 1,  1, 1, 0, 0,  1, 1, 0, 0
+		0x3c3c, // 0, 0, 1, 1,  1, 1, 0, 0,  0, 0, 1, 1,  1, 1, 0, 0
+		0x55aa, // 0, 1, 0, 1,  0, 1, 0, 1,  1, 0, 1, 0,  1, 0, 1, 0
+		0x9696, // 0, 1, 1, 0,  1, 0, 0, 1,  0, 1, 1, 0,  1, 0, 0, 1
+		0xa55a, // 0, 1, 0, 1,  1, 0, 1, 0,  1, 0, 1, 0,  0, 1, 0, 1
+		0x73ce, // 0, 1, 1, 1,  0, 0, 1, 1,  1, 1, 0, 0,  1, 1, 1, 0
+		0x13c8, // 0, 0, 0, 1,  0, 0, 1, 1,  1, 1, 0, 0,  1, 0, 0, 0
+		0x324c, // 0, 0, 1, 1,  0, 0, 1, 0,  0, 1, 0, 0,  1, 1, 0, 0
+		0x3bdc, // 0, 0, 1, 1,  1, 0, 1, 1,  1, 1, 0, 1,  1, 1, 0, 0
+		0x6996, // 0, 1, 1, 0,  1, 0, 0, 1,  1, 0, 0, 1,  0, 1, 1, 0
+		0xc33c, // 0, 0, 1, 1,  1, 1, 0, 0,  1, 1, 0, 0,  0, 0, 1, 1
+		0x9966, // 0, 1, 1, 0,  0, 1, 1, 0,  1, 0, 0, 1,  1, 0, 0, 1
+		0x0660, // 0, 0, 0, 0,  0, 1, 1, 0,  0, 1, 1, 0,  0, 0, 0, 0
+		0x0272, // 0, 1, 0, 0,  1, 1, 1, 0,  0, 1, 0, 0,  0, 0, 0, 0
+		0x04e4, // 0, 0, 1, 0,  0, 1, 1, 1,  0, 0, 1, 0,  0, 0, 0, 0
+		0x4e40, // 0, 0, 0, 0,  0, 0, 1, 0,  0, 1, 1, 1,  0, 0, 1, 0
+		0x2720, // 0, 0, 0, 0,  0, 1, 0, 0,  1, 1, 1, 0,  0, 1, 0, 0
+		0xc936, // 0, 1, 1, 0,  1, 1, 0, 0,  1, 0, 0, 1,  0, 0, 1, 1
+		0x936c, // 0, 0, 1, 1,  0, 1, 1, 0,  1, 1, 0, 0,  1, 0, 0, 1
+		0x39c6, // 0, 1, 1, 0,  0, 0, 1, 1,  1, 0, 0, 1,  1, 1, 0, 0
+		0x639c, // 0, 0, 1, 1,  1, 0, 0, 1,  1, 1, 0, 0,  0, 1, 1, 0
+		0x9336, // 0, 1, 1, 0,  1, 1, 0, 0,  1, 1, 0, 0,  1, 0, 0, 1
+		0x9cc6, // 0, 1, 1, 0,  0, 0, 1, 1,  0, 0, 1, 1,  1, 0, 0, 1
+		0x817e, // 0, 1, 1, 1,  1, 1, 1, 0,  1, 0, 0, 0,  0, 0, 0, 1
+		0xe718, // 0, 0, 0, 1,  1, 0, 0, 0,  1, 1, 1, 0,  0, 1, 1, 1
+		0xccf0, // 0, 0, 0, 0,  1, 1, 1, 1,  0, 0, 1, 1,  0, 0, 1, 1
+		0x0fcc, // 0, 0, 1, 1,  0, 0, 1, 1,  1, 1, 1, 1,  0, 0, 0, 0
+		0x7744, // 0, 0, 1, 0,  0, 0, 1, 0,  1, 1, 1, 0,  1, 1, 1, 0
+		0xee22, // 0, 1, 0, 0,  0, 1, 0, 0,  0, 1, 1, 1,  0, 1, 1, 1
 	};
 
-	static const uint8_t s_etc2Mod[8] = { 3, 6, 11, 16, 23, 32, 41, 64 };
+	static const uint32_t s_bptcP3[] = // 3-subset partitions: bits [2*i+1 : 2*i] of entry P are the subset (0..2) of texel i = y*4+x
+	{ //  76543210     0000   1111   2222   3333   4444   5555   6666   7777
+		0xaa685050, // 0, 0,  1, 1,  0, 0,  1, 1,  0, 2,  2, 1,  2, 2,  2, 2
+		0x6a5a5040,	// 0, 0,  0, 1,  0, 0,  1, 1,  2, 2,  1, 1,  2, 2,  2, 1
+		0x5a5a4200,	// 0, 0,  0, 0,  2, 0,  0, 1,  2, 2,  1, 1,  2, 2,  1, 1
+		0x5450a0a8,	// 0, 2,  2, 2,  0, 0,  2, 2,  0, 0,  1, 1,  0, 1,  1, 1
+		0xa5a50000,	// 0, 0,  0, 0,  0, 0,  0, 0,  1, 1,  2, 2,  1, 1,  2, 2
+		0xa0a05050,	// 0, 0,  1, 1,  0, 0,  1, 1,  0, 0,  2, 2,  0, 0,  2, 2
+		0x5555a0a0,	// 0, 0,  2, 2,  0, 0,  2, 2,  1, 1,  1, 1,  1, 1,  1, 1
+		0x5a5a5050,	// 0, 0,  1, 1,  0, 0,  1, 1,  2, 2,  1, 1,  2, 2,  1, 1
+		0xaa550000,	// 0, 0,  0, 0,  0, 0,  0, 0,  1, 1,  1, 1,  2, 2,  2, 2
+		0xaa555500,	// 0, 0,  0, 0,  1, 1,  1, 1,  1, 1,  1, 1,  2, 2,  2, 2
+		0xaaaa5500,	// 0, 0,  0, 0,  1, 1,  1, 1,  2, 2,  2, 2,  2, 2,  2, 2
+		0x90909090,	// 0, 0,  1, 2,  0, 0,  1, 2,  0, 0,  1, 2,  0, 0,  1, 2
+		0x94949494,	// 0, 1,  1, 2,  0, 1,  1, 2,  0, 1,  1, 2,  0, 1,  1, 2
+		0xa4a4a4a4,	// 0, 1,  2, 2,  0, 1,  2, 2,  0, 1,  2, 2,  0, 1,  2, 2
+		0xa9a59450,	// 0, 0,  1, 1,  0, 1,  1, 2,  1, 1,  2, 2,  1, 2,  2, 2
+		0x2a0a4250,	// 0, 0,  1, 1,  2, 0,  0, 1,  2, 2,  0, 0,  2, 2,  2, 0
+		0xa5945040,	// 0, 0,  0, 1,  0, 0,  1, 1,  0, 1,  1, 2,  1, 1,  2, 2
+		0x0a425054,	// 0, 1,  1, 1,  0, 0,  1, 1,  2, 0,  0, 1,  2, 2,  0, 0
+		0xa5a5a500,	// 0, 0,  0, 0,  1, 1,  2, 2,  1, 1,  2, 2,  1, 1,  2, 2
+		0x55a0a0a0,	// 0, 0,  2, 2,  0, 0,  2, 2,  0, 0,  2, 2,  1, 1,  1, 1
+		0xa8a85454,	// 0, 1,  1, 1,  0, 1,  1, 1,  0, 2,  2, 2,  0, 2,  2, 2
+		0x6a6a4040,	// 0, 0,  0, 1,  0, 0,  0, 1,  2, 2,  2, 1,  2, 2,  2, 1
+		0xa4a45000,	// 0, 0,  0, 0,  0, 0,  1, 1,  0, 1,  2, 2,  0, 1,  2, 2
+		0x1a1a0500,	// 0, 0,  0, 0,  1, 1,  0, 0,  2, 2,  1, 0,  2, 2,  1, 0
+		0x0050a4a4,	// 0, 1,  2, 2,  0, 1,  2, 2,  0, 0,  1, 1,  0, 0,  0, 0
+		0xaaa59090,	// 0, 0,  1, 2,  0, 0,  1, 2,  1, 1,  2, 2,  2, 2,  2, 2
+		0x14696914,	// 0, 1,  1, 0,  1, 2,  2, 1,  1, 2,  2, 1,  0, 1,  1, 0
+		0x69691400,	// 0, 0,  0, 0,  0, 1,  1, 0,  1, 2,  2, 1,  1, 2,  2, 1
+		0xa08585a0,	// 0, 0,  2, 2,  1, 1,  0, 2,  1, 1,  0, 2,  0, 0,  2, 2
+		0xaa821414,	// 0, 1,  1, 0,  0, 1,  1, 0,  2, 0,  0, 2,  2, 2,  2, 2
+		0x50a4a450,	// 0, 0,  1, 1,  0, 1,  2, 2,  0, 1,  2, 2,  0, 0,  1, 1
+		0x6a5a0200,	// 0, 0,  0, 0,  2, 0,  0, 0,  2, 2,  1, 1,  2, 2,  2, 1
+		0xa9a58000,	// 0, 0,  0, 0,  0, 0,  0, 2,  1, 1,  2, 2,  1, 2,  2, 2
+		0x5090a0a8,	// 0, 2,  2, 2,  0, 0,  2, 2,  0, 0,  1, 2,  0, 0,  1, 1
+		0xa8a09050,	// 0, 0,  1, 1,  0, 0,  1, 2,  0, 0,  2, 2,  0, 2,  2, 2
+		0x24242424,	// 0, 1,  2, 0,  0, 1,  2, 0,  0, 1,  2, 0,  0, 1,  2, 0
+		0x00aa5500,	// 0, 0,  0, 0,  1, 1,  1, 1,  2, 2,  2, 2,  0, 0,  0, 0
+		0x24924924,	// 0, 1,  2, 0,  1, 2,  0, 1,  2, 0,  1, 2,  0, 1,  2, 0
+		0x24499224,	// 0, 1,  2, 0,  2, 0,  1, 2,  1, 2,  0, 1,  0, 1,  2, 0
+		0x50a50a50,	// 0, 0,  1, 1,  2, 2,  0, 0,  1, 1,  2, 2,  0, 0,  1, 1
+		0x500aa550,	// 0, 0,  1, 1,  1, 1,  2, 2,  2, 2,  0, 0,  0, 0,  1, 1
+		0xaaaa4444,	// 0, 1,  0, 1,  0, 1,  0, 1,  2, 2,  2, 2,  2, 2,  2, 2
+		0x66660000,	// 0, 0,  0, 0,  0, 0,  0, 0,  2, 1,  2, 1,  2, 1,  2, 1
+		0xa5a0a5a0,	// 0, 0,  2, 2,  1, 1,  2, 2,  0, 0,  2, 2,  1, 1,  2, 2
+		0x50a050a0,	// 0, 0,  2, 2,  0, 0,  1, 1,  0, 0,  2, 2,  0, 0,  1, 1
+		0x69286928,	// 0, 2,  2, 0,  1, 2,  2, 1,  0, 2,  2, 0,  1, 2,  2, 1
+		0x44aaaa44,	// 0, 1,  0, 1,  2, 2,  2, 2,  2, 2,  2, 2,  0, 1,  0, 1
+		0x66666600,	// 0, 0,  0, 0,  2, 1,  2, 1,  2, 1,  2, 1,  2, 1,  2, 1
+		0xaa444444,	// 0, 1,  0, 1,  0, 1,  0, 1,  0, 1,  0, 1,  2, 2,  2, 2
+		0x54a854a8,	// 0, 2,  2, 2,  0, 1,  1, 1,  0, 2,  2, 2,  0, 1,  1, 1
+		0x95809580,	// 0, 0,  0, 2,  1, 1,  1, 2,  0, 0,  0, 2,  1, 1,  1, 2
+		0x96969600,	// 0, 0,  0, 0,  2, 1,  1, 2,  2, 1,  1, 2,  2, 1,  1, 2
+		0xa85454a8,	// 0, 2,  2, 2,  0, 1,  1, 1,  0, 1,  1, 1,  0, 2,  2, 2
+		0x80959580,	// 0, 0,  0, 2,  1, 1,  1, 2,  1, 1,  1, 2,  0, 0,  0, 2
+		0xaa141414,	// 0, 1,  1, 0,  0, 1,  1, 0,  0, 1,  1, 0,  2, 2,  2, 2
+		0x96960000,	// 0, 0,  0, 0,  0, 0,  0, 0,  2, 1,  1, 2,  2, 1,  1, 2
+		0xaaaa1414,	// 0, 1,  1, 0,  0, 1,  1, 0,  2, 2,  2, 2,  2, 2,  2, 2
+		0xa05050a0,	// 0, 0,  2, 2,  0, 0,  1, 1,  0, 0,  1, 1,  0, 0,  2, 2
+		0xa0a5a5a0,	// 0, 0,  2, 2,  1, 1,  2, 2,  1, 1,  2, 2,  0, 0,  2, 2
+		0x96000000,	// 0, 0,  0, 0,  0, 0,  0, 0,  0, 0,  0, 0,  2, 1,  1, 2
+		0x40804080,	// 0, 0,  0, 2,  0, 0,  0, 1,  0, 0,  0, 2,  0, 0,  0, 1
+		0xa9a8a9a8,	// 0, 2,  2, 2,  1, 2,  2, 2,  0, 2,  2, 2,  1, 2,  2, 2
+		0xaaaaaa44,	// 0, 1,  0, 1,  2, 2,  2, 2,  2, 2,  2, 2,  2, 2,  2, 2
+		0x2a4a5254,	// 0, 1,  1, 1,  2, 0,  1, 1,  2, 2,  0, 1,  2, 2,  2, 0
+	};
+
+	static const uint8_t s_bptcA2[] = // per 2-subset partition set: texel index (y*4+x) of subset 1's anchor
+	{
+		15, 15, 15, 15, 15, 15, 15, 15,
+		15, 15, 15, 15, 15, 15, 15, 15,
+		15,  2,  8,  2,  2,  8,  8, 15,
+		 2,  8,  2,  2,  8,  8,  2,  2,
+		15, 15,  6,  8,  2,  8, 15, 15,
+		 2,  8,  2,  2,  2, 15, 15,  6,
+		 6,  2,  6,  8, 15, 15,  2,  2,
+		15, 15, 15, 15, 15,  2,  2, 15,
+	};
+
+	static const uint8_t s_bptcA3[2][64] = // anchor texel index per 3-subset partition set; indexed [subset-1][partition set]
+	{
+		{ // anchors of subset 1
+			 3,  3, 15, 15,  8,  3, 15, 15,
+			 8,  8,  6,  6,  6,  5,  3,  3,
+			 3,  3,  8, 15,  3,  3,  6, 10,
+			 5,  8,  8,  6,  8,  5, 15, 15,
+			 8, 15,  3,  5,  6, 10,  8, 15,
+			15,  3, 15,  5, 15, 15, 15, 15,
+			 3, 15,  5,  5,  5,  8,  5, 10,
+			 5, 10,  8, 13, 15, 12,  3,  3,
+		},
+		{ // anchors of subset 2
+			15,  8,  8,  3, 15, 15,  3,  8,
+			15, 15, 15, 15, 15, 15, 15,  8,
+			15,  8, 15,  3, 15,  8, 15,  8,
+			 3, 15,  6, 10, 15, 15, 10,  8,
+			15,  3, 15, 10, 10,  8,  9, 10,
+			 6, 15,  8, 15,  3,  6,  6,  8,
+			15,  3, 15, 15, 15, 15, 15, 15,
+			15, 15, 15, 15,  3, 15, 15,  8,
+		},
+	};
+
+	static const uint8_t s_bptcFactors[3][16] = // interpolation weights (out of 64); row = index width in bits minus 2
+	{
+		{  0, 21, 43, 64,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 }, // 2-bit indices
+		{  0,  9, 18, 27, 37, 46, 55, 64,  0,  0,  0,  0,  0,  0,  0,  0 }, // 3-bit indices
+		{  0,  4,  9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }, // 4-bit indices
+	};
+
+	struct BitReader // bit cursor over a 16-byte block; bits are consumed starting at the low bit of each byte
+	{
+		BitReader(const uint8_t* _data, uint16_t _bitPos = 0)
+			: m_data(_data)
+			, m_bitPos(_bitPos)
+		{
+		}
+
+		uint16_t read(uint8_t _numBits) // returns the next _numBits bits and advances the cursor by that many
+		{
+			const uint16_t pos   = m_bitPos / 8;
+			const uint16_t shift = m_bitPos & 7;
+			uint32_t data = 0; // NOTE(review): byte copy into a uint32_t assumes a little-endian host - confirm
+			bx::memCopy(&data, &m_data[pos], bx::min(4, 16-pos) ); // at most 4 bytes, clamped to the end of the block
+			m_bitPos += _numBits;
+			return uint16_t( (data >> shift) & ( (1 << _numBits)-1) );
+		}
+
+		uint16_t peek(uint16_t _offset, uint8_t _numBits) // like read(), but at cursor + _offset and without advancing
+		{
+			const uint16_t bitPos = m_bitPos + _offset;
+			const uint16_t shift  = bitPos & 7;
+			uint16_t pos  = bitPos / 8;
+			uint32_t data = 0;
+			bx::memCopy(&data, &m_data[pos], bx::min(4, 16-pos) );
+			return uint16_t( (data >> shift) & ( (1 << _numBits)-1) ); // keep all 16 bits, same as read()
+		}
+
+		const uint8_t* m_data; // start of the compressed block (not owned)
+		uint16_t m_bitPos;     // current read position, in bits from m_data
+	};
+
+	uint16_t bc6hUnquantize(uint16_t _value, bool _signed, uint8_t _endpointBits) // expands an _endpointBits-wide BC6H endpoint to 16 bits
+	{
+		const uint16_t maxValue = 1<<(_endpointBits-1); // used by the signed branch only
+
+		if (_signed)
+		{
+			if (_endpointBits >= 16) // already full width
+			{
+				return _value;
+			}
+
+			const bool sign = !!(_value & 0x8000);
+			_value &= 0x7fff; // NOTE(review): magnitude is taken by masking bit 15, not by negating - confirm for negative endpoints
+
+			uint16_t unq;
+
+			if (0 == _value)
+			{
+				unq = 0;
+			}
+			else if (_value >= maxValue-1) // saturate at the largest positive value
+			{
+				unq = 0x7fff;
+			}
+			else
+			{
+				unq = ( (_value<<15) + 0x4000) >> (_endpointBits-1);
+			}
+
+			return sign ? -unq : unq;
+		}
+
+		if (_endpointBits >= 15) // unsigned endpoints of 15+ bits are passed through
+		{
+			return _value;
+		}
+
+		if (0 == _value)
+		{
+			return 0;
+		}
+
+		if (_value == (1<<_endpointBits)-1) // only the all-ones endpoint saturates to the maximum
+		{
+			return UINT16_MAX;
+		}
+
+		return ( (_value<<15) + 0x4000) >> (_endpointBits-1);
+	}
+
+	uint16_t bc6hUnquantizeFinal(uint16_t _value, bool _signed)
+	{
+		// Last scaling step after interpolation: multiply by 31, then shift right
+		// by 6 (unsigned) or by 5 with bit 15 carried over unchanged (signed).
+		if (!_signed)
+		{
+			return uint16_t( (_value * 31) >> 6);
+		}
+
+		const uint16_t magnitude = _value & 0x7fff;
+		return uint16_t( ( (magnitude * 31) >> 5) | (_value & 0x8000) );
+	}
+
+	uint16_t signExtend(uint16_t _value, uint8_t _numBits)
+	{
+		// Flip the field's top bit, then subtract it: values that had the bit set
+		// wrap around to the matching negative number in 16-bit two's complement.
+		const uint16_t signBit = uint16_t(1 << (_numBits - 1) );
+		return uint16_t( (_value ^ signBit) - signBit);
+	}
+
+	struct Bc6hModeInfo // per-mode BC6H layout parameters, looked up by the decoded mode value
+	{
+		uint8_t transformed;   // non-zero: endpoints after the first are deltas added to endpoint 0
+		uint8_t partitionBits; // non-zero: block has two subsets and a partition set index
+		uint8_t endpointBits;  // width of endpoint 0 per channel; 0 marks an unused mode
+		uint8_t deltaBits[3];  // R, G, B widths of the remaining endpoints (used for sign extension)
+	};
+
+	static const Bc6hModeInfo s_bc6hModeInfo[] =
+	{ //  +--------------------------- transformed
+	  //  |  +------------------------ partition bits
+	  //  |  |  +--------------------- endpoint bits
+	  //  |  |  |      +-------------- delta bits
+		{ 1, 5, 10, {  5,  5,  5 } }, // 00    2-bits
+		{ 1, 5,  7, {  6,  6,  6 } }, // 01
+		{ 1, 5, 11, {  5,  4,  4 } }, // 00010 5-bits
+		{ 0, 0, 10, { 10, 10, 10 } }, // 00011
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 1, 5, 11, {  4,  5,  4 } }, // 00110
+		{ 1, 0, 11, {  9,  9,  9 } }, // 00111
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 1, 5, 11, {  4,  4,  5 } }, // 01010
+		{ 1, 0, 12, {  8,  8,  8 } }, // 01011
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 1, 5,  9, {  5,  5,  5 } }, // 01110
+		{ 1, 0, 16, {  4,  4,  4 } }, // 01111
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 1, 5,  8, {  6,  5,  5 } }, // 10010
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 1, 5,  8, {  5,  6,  5 } }, // 10110
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 1, 5,  8, {  5,  5,  6 } }, // 11010
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+		{ 0, 5,  6, {  6,  6,  6 } }, // 11110
+		{ 0, 0,  0, {  0,  0,  0 } }, // -
+	};
+
+	void decodeBlockBc6h(uint16_t _dst[16*3], const uint8_t _src[16], bool _signed) // decodes one 16-byte BC6H block into 16 RGB texels (3 x uint16_t each)
+	{
+		BitReader bit(_src);
+
+		uint8_t mode = uint8_t(bit.read(2) ); // values 0 and 1 are complete 2-bit modes
+		if (mode & 2)
+		{
+			// 5-bit mode
+			mode |= bit.read(3) << 2;
+		}
+
+		const Bc6hModeInfo& mi = s_bc6hModeInfo[mode];
+		if (0 == mi.endpointBits) // unused table slot: output an all-zero block
+		{
+			bx::memSet(_dst, 0, 16*3*2);
+			return;
+		}
+
+		uint16_t epR[4] = { /* rw, rx, ry, rz */ };
+		uint16_t epG[4] = { /* gw, gx, gy, gz */ };
+		uint16_t epB[4] = { /* bw, bx, by, bz */ };
+
+		switch (mode) // unpack endpoint fields; the statement order is the bit order in the block
+		{
+		case 0:
+			epG[2] |= bit.read( 1) <<  4;
+			epB[2] |= bit.read( 1) <<  4;
+			epB[3] |= bit.read( 1) <<  4;
+			epR[0] |= bit.read(10) <<  0;
+			epG[0] |= bit.read(10) <<  0;
+			epB[0] |= bit.read(10) <<  0;
+			epR[1] |= bit.read( 5) <<  0;
+			epG[3] |= bit.read( 1) <<  4;
+			epG[2] |= bit.read( 4) <<  0;
+			epG[1] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  0;
+			epG[3] |= bit.read( 4) <<  0;
+			epB[1] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  1;
+			epB[2] |= bit.read( 4) <<  0;
+			epR[2] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  2;
+			epR[3] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  3;
+			break;
+
+		case 1:
+			epG[2] |= bit.read( 1) <<  5;
+			epG[3] |= bit.read( 1) <<  4;
+			epG[3] |= bit.read( 1) <<  5;
+			epR[0] |= bit.read( 7) <<  0;
+			epB[3] |= bit.read( 1) <<  0;
+			epB[3] |= bit.read( 1) <<  1;
+			epB[2] |= bit.read( 1) <<  4;
+			epG[0] |= bit.read( 7) <<  0;
+			epB[2] |= bit.read( 1) <<  5;
+			epB[3] |= bit.read( 1) <<  2;
+			epG[2] |= bit.read( 1) <<  4;
+			epB[0] |= bit.read( 7) <<  0;
+			epB[3] |= bit.read( 1) <<  3;
+			epB[3] |= bit.read( 1) <<  5;
+			epB[3] |= bit.read( 1) <<  4;
+			epR[1] |= bit.read( 6) <<  0;
+			epG[2] |= bit.read( 4) <<  0;
+			epG[1] |= bit.read( 6) <<  0;
+			epG[3] |= bit.read( 4) <<  0;
+			epB[1] |= bit.read( 6) <<  0;
+			epB[2] |= bit.read( 4) <<  0;
+			epR[2] |= bit.read( 6) <<  0;
+			epR[3] |= bit.read( 6) <<  0;
+			break;
+
+		case 2:
+			epR[0] |= bit.read(10) <<  0;
+			epG[0] |= bit.read(10) <<  0;
+			epB[0] |= bit.read(10) <<  0;
+			epR[1] |= bit.read( 5) <<  0;
+			epR[0] |= bit.read( 1) << 10;
+			epG[2] |= bit.read( 4) <<  0;
+			epG[1] |= bit.read( 4) <<  0;
+			epG[0] |= bit.read( 1) << 10;
+			epB[3] |= bit.read( 1) <<  0;
+			epG[3] |= bit.read( 4) <<  0;
+			epB[1] |= bit.read( 4) <<  0;
+			epB[0] |= bit.read( 1) << 10;
+			epB[3] |= bit.read( 1) <<  1;
+			epB[2] |= bit.read( 4) <<  0;
+			epR[2] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  2;
+			epR[3] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  3;
+			break;
+
+		case 3:
+			epR[0] |= bit.read(10) <<  0;
+			epG[0] |= bit.read(10) <<  0;
+			epB[0] |= bit.read(10) <<  0;
+			epR[1] |= bit.read(10) <<  0;
+			epG[1] |= bit.read(10) <<  0;
+			epB[1] |= bit.read(10) <<  0;
+			break;
+
+		case 6:
+			epR[0] |= bit.read(10) <<  0;
+			epG[0] |= bit.read(10) <<  0;
+			epB[0] |= bit.read(10) <<  0;
+			epR[1] |= bit.read( 4) <<  0;
+			epR[0] |= bit.read( 1) << 10;
+			epG[3] |= bit.read( 1) <<  4;
+			epG[2] |= bit.read( 4) <<  0;
+			epG[1] |= bit.read( 5) <<  0;
+			epG[0] |= bit.read( 1) << 10;
+			epG[3] |= bit.read( 4) <<  0;
+			epB[1] |= bit.read( 4) <<  0;
+			epB[0] |= bit.read( 1) << 10;
+			epB[3] |= bit.read( 1) <<  1;
+			epB[2] |= bit.read( 4) <<  0;
+			epR[2] |= bit.read( 4) <<  0;
+			epB[3] |= bit.read( 1) <<  0;
+			epB[3] |= bit.read( 1) <<  2;
+			epR[3] |= bit.read( 4) <<  0;
+			epG[2] |= bit.read( 1) <<  4;
+			epB[3] |= bit.read( 1) <<  3;
+			break;
+
+		case 7:
+			epR[0] |= bit.read(10) <<  0;
+			epG[0] |= bit.read(10) <<  0;
+			epB[0] |= bit.read(10) <<  0;
+			epR[1] |= bit.read( 9) <<  0;
+			epR[0] |= bit.read( 1) << 10;
+			epG[1] |= bit.read( 9) <<  0;
+			epG[0] |= bit.read( 1) << 10;
+			epB[1] |= bit.read( 9) <<  0;
+			epB[0] |= bit.read( 1) << 10;
+			break;
+
+		case 10:
+			epR[0] |= bit.read(10) <<  0;
+			epG[0] |= bit.read(10) <<  0;
+			epB[0] |= bit.read(10) <<  0;
+			epR[1] |= bit.read( 4) <<  0;
+			epR[0] |= bit.read( 1) << 10;
+			epB[2] |= bit.read( 1) <<  4;
+			epG[2] |= bit.read( 4) <<  0;
+			epG[1] |= bit.read( 4) <<  0;
+			epG[0] |= bit.read( 1) << 10;
+			epB[3] |= bit.read( 1) <<  0;
+			epG[3] |= bit.read( 4) <<  0;
+			epB[1] |= bit.read( 5) <<  0;
+			epB[0] |= bit.read( 1) << 10;
+			epB[2] |= bit.read( 4) <<  0;
+			epR[2] |= bit.read( 4) <<  0;
+			epB[3] |= bit.read( 1) <<  1;
+			epB[3] |= bit.read( 1) <<  2;
+			epR[3] |= bit.read( 4) <<  0;
+			epB[3] |= bit.read( 1) <<  4;
+			epB[3] |= bit.read( 1) <<  3;
+			break;
+
+		case 11:
+			epR[0] |= bit.read(10) <<  0;
+			epG[0] |= bit.read(10) <<  0;
+			epB[0] |= bit.read(10) <<  0;
+			epR[1] |= bit.read( 8) <<  0;
+			epR[0] |= bit.read( 1) << 11;
+			epR[0] |= bit.read( 1) << 10;
+			epG[1] |= bit.read( 8) <<  0;
+			epG[0] |= bit.read( 1) << 11;
+			epG[0] |= bit.read( 1) << 10;
+			epB[1] |= bit.read( 8) <<  0;
+			epB[0] |= bit.read( 1) << 11;
+			epB[0] |= bit.read( 1) << 10;
+			break;
+
+		case 14:
+			epR[0] |= bit.read( 9) <<  0;
+			epB[2] |= bit.read( 1) <<  4;
+			epG[0] |= bit.read( 9) <<  0;
+			epG[2] |= bit.read( 1) <<  4;
+			epB[0] |= bit.read( 9) <<  0;
+			epB[3] |= bit.read( 1) <<  4;
+			epR[1] |= bit.read( 5) <<  0;
+			epG[3] |= bit.read( 1) <<  4;
+			epG[2] |= bit.read( 4) <<  0;
+			epG[1] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  0;
+			epG[3] |= bit.read( 4) <<  0;
+			epB[1] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  1;
+			epB[2] |= bit.read( 4) <<  0;
+			epR[2] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  2;
+			epR[3] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  3;
+			break;
+
+		case 15:
+			epR[0] |= bit.read(10) <<  0;
+			epG[0] |= bit.read(10) <<  0;
+			epB[0] |= bit.read(10) <<  0;
+			epR[1] |= bit.read( 4) <<  0;
+			epR[0] |= bit.read( 1) << 15;
+			epR[0] |= bit.read( 1) << 14;
+			epR[0] |= bit.read( 1) << 13;
+			epR[0] |= bit.read( 1) << 12;
+			epR[0] |= bit.read( 1) << 11;
+			epR[0] |= bit.read( 1) << 10;
+			epG[1] |= bit.read( 4) <<  0;
+			epG[0] |= bit.read( 1) << 15;
+			epG[0] |= bit.read( 1) << 14;
+			epG[0] |= bit.read( 1) << 13;
+			epG[0] |= bit.read( 1) << 12;
+			epG[0] |= bit.read( 1) << 11;
+			epG[0] |= bit.read( 1) << 10;
+			epB[1] |= bit.read( 4) <<  0;
+			epB[0] |= bit.read( 1) << 15;
+			epB[0] |= bit.read( 1) << 14;
+			epB[0] |= bit.read( 1) << 13;
+			epB[0] |= bit.read( 1) << 12;
+			epB[0] |= bit.read( 1) << 11;
+			epB[0] |= bit.read( 1) << 10;
+			break;
+
+		case 18:
+			epR[0] |= bit.read( 8) <<  0;
+			epG[3] |= bit.read( 1) <<  4;
+			epB[2] |= bit.read( 1) <<  4;
+			epG[0] |= bit.read( 8) <<  0;
+			epB[3] |= bit.read( 1) <<  2;
+			epG[2] |= bit.read( 1) <<  4;
+			epB[0] |= bit.read( 8) <<  0;
+			epB[3] |= bit.read( 1) <<  3;
+			epB[3] |= bit.read( 1) <<  4;
+			epR[1] |= bit.read( 6) <<  0;
+			epG[2] |= bit.read( 4) <<  0;
+			epG[1] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  0;
+			epG[3] |= bit.read( 4) <<  0;
+			epB[1] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  1;
+			epB[2] |= bit.read( 4) <<  0;
+			epR[2] |= bit.read( 6) <<  0;
+			epR[3] |= bit.read( 6) <<  0;
+			break;
+
+		case 22:
+			epR[0] |= bit.read( 8) <<  0;
+			epB[3] |= bit.read( 1) <<  0;
+			epB[2] |= bit.read( 1) <<  4;
+			epG[0] |= bit.read( 8) <<  0;
+			epG[2] |= bit.read( 1) <<  5;
+			epG[2] |= bit.read( 1) <<  4;
+			epB[0] |= bit.read( 8) <<  0;
+			epG[3] |= bit.read( 1) <<  5;
+			epB[3] |= bit.read( 1) <<  4;
+			epR[1] |= bit.read( 5) <<  0;
+			epG[3] |= bit.read( 1) <<  4;
+			epG[2] |= bit.read( 4) <<  0;
+			epG[1] |= bit.read( 6) <<  0;
+			epG[3] |= bit.read( 4) <<  0;
+			epB[1] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  1;
+			epB[2] |= bit.read( 4) <<  0;
+			epR[2] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  2;
+			epR[3] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  3;
+			break;
+
+		case 26:
+			epR[0] |= bit.read( 8) <<  0;
+			epB[3] |= bit.read( 1) <<  1;
+			epB[2] |= bit.read( 1) <<  4;
+			epG[0] |= bit.read( 8) <<  0;
+			epB[2] |= bit.read( 1) <<  5;
+			epG[2] |= bit.read( 1) <<  4;
+			epB[0] |= bit.read( 8) <<  0;
+			epB[3] |= bit.read( 1) <<  5;
+			epB[3] |= bit.read( 1) <<  4;
+			epR[1] |= bit.read( 5) <<  0;
+			epG[3] |= bit.read( 1) <<  4;
+			epG[2] |= bit.read( 4) <<  0;
+			epG[1] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  0;
+			epG[3] |= bit.read( 4) <<  0;
+			epB[1] |= bit.read( 6) <<  0;
+			epB[2] |= bit.read( 4) <<  0;
+			epR[2] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  2;
+			epR[3] |= bit.read( 5) <<  0;
+			epB[3] |= bit.read( 1) <<  3;
+			break;
+
+		case 30:
+			epR[0] |= bit.read( 6) <<  0;
+			epG[3] |= bit.read( 1) <<  4;
+			epB[3] |= bit.read( 1) <<  0;
+			epB[3] |= bit.read( 1) <<  1;
+			epB[2] |= bit.read( 1) <<  4;
+			epG[0] |= bit.read( 6) <<  0;
+			epG[2] |= bit.read( 1) <<  5;
+			epB[2] |= bit.read( 1) <<  5;
+			epB[3] |= bit.read( 1) <<  2;
+			epG[2] |= bit.read( 1) <<  4;
+			epB[0] |= bit.read( 6) <<  0;
+			epG[3] |= bit.read( 1) <<  5;
+			epB[3] |= bit.read( 1) <<  3;
+			epB[3] |= bit.read( 1) <<  5;
+			epB[3] |= bit.read( 1) <<  4;
+			epR[1] |= bit.read( 6) <<  0;
+			epG[2] |= bit.read( 4) <<  0;
+			epG[1] |= bit.read( 6) <<  0;
+			epG[3] |= bit.read( 4) <<  0;
+			epB[1] |= bit.read( 6) <<  0;
+			epB[2] |= bit.read( 4) <<  0;
+			epR[2] |= bit.read( 6) <<  0;
+			epR[3] |= bit.read( 6) <<  0;
+			break;
+
+		default:
+			break;
+		}
+
+		if (_signed)
+		{
+			epR[0] = signExtend(epR[0], mi.endpointBits);
+			epG[0] = signExtend(epG[0], mi.endpointBits);
+			epB[0] = signExtend(epB[0], mi.endpointBits);
+		}
+
+		const uint8_t numSubsets = !!mi.partitionBits + 1; // two subsets when the mode has partition bits, else one
+
+		for (uint8_t ii = 1, num = numSubsets*2; ii < num; ++ii) // every endpoint except the first
+		{
+			if (_signed
+			||  mi.transformed)
+			{
+				epR[ii] = signExtend(epR[ii], mi.deltaBits[0]);
+				epG[ii] = signExtend(epG[ii], mi.deltaBits[1]);
+				epB[ii] = signExtend(epB[ii], mi.deltaBits[2]);
+			}
+
+			if (mi.transformed) // stored value is a delta relative to endpoint 0
+			{
+				const uint16_t mask = (1<<mi.endpointBits) - 1;
+
+				epR[ii] = (epR[ii] + epR[0]) & mask;
+				epG[ii] = (epG[ii] + epG[0]) & mask;
+				epB[ii] = (epB[ii] + epB[0]) & mask;
+
+				if (_signed)
+				{
+					epR[ii] = signExtend(epR[ii], mi.endpointBits);
+					epG[ii] = signExtend(epG[ii], mi.endpointBits);
+					epB[ii] = signExtend(epB[ii], mi.endpointBits);
+				}
+			}
+		}
+
+		for (uint8_t ii = 0, num = numSubsets*2; ii < num; ++ii)
+		{
+			epR[ii] = bc6hUnquantize(epR[ii], _signed, mi.endpointBits);
+			epG[ii] = bc6hUnquantize(epG[ii], _signed, mi.endpointBits);
+			epB[ii] = bc6hUnquantize(epB[ii], _signed, mi.endpointBits);
+		}
+
+		const uint8_t partitionSetIdx = uint8_t(mi.partitionBits ? bit.read(5) : 0); // partition index follows the endpoints
+		const uint8_t indexBits = mi.partitionBits ? 3 : 4;
+		const uint8_t* factors  = s_bptcFactors[indexBits-2];
+
+		for (uint8_t yy = 0; yy < 4; ++yy)
+		{
+			for (uint8_t xx = 0; xx < 4; ++xx)
+			{
+				const uint8_t idx = yy*4+xx;
+
+				uint8_t subsetIndex = 0;
+				uint8_t indexAnchor = 0;
+
+				if (0 != mi.partitionBits)
+				{
+					subsetIndex = (s_bptcP2[partitionSetIdx] >> idx) & 1;
+					indexAnchor = subsetIndex ? s_bptcA2[partitionSetIdx] : 0;
+				}
+
+				const uint8_t anchor = idx == indexAnchor; // anchor texels store one index bit fewer
+				const uint8_t num    = indexBits - anchor;
+				const uint8_t index  = (uint8_t)bit.read(num);
+
+				const uint8_t fc  = factors[index];
+				const uint8_t fca = 64 - fc;
+				const uint8_t fcb = fc;
+
+				subsetIndex *= 2; // now the array index of the subset's first endpoint
+				uint16_t rr = bc6hUnquantizeFinal( (epR[subsetIndex]*fca + epR[subsetIndex + 1]*fcb + 32) >> 6, _signed);
+				uint16_t gg = bc6hUnquantizeFinal( (epG[subsetIndex]*fca + epG[subsetIndex + 1]*fcb + 32) >> 6, _signed);
+				uint16_t bb = bc6hUnquantizeFinal( (epB[subsetIndex]*fca + epB[subsetIndex + 1]*fcb + 32) >> 6, _signed);
+
+				uint16_t* rgba = &_dst[idx*3]; // three values per texel: R, G, B
+				rgba[0] = rr;
+				rgba[1] = gg;
+				rgba[2] = bb;
+			}
+		}
+	}
+
+	void decodeBlockBc6h(float _dst[16*4], const uint8_t _src[16])
+	{
+		// Decode to 16 RGB 16-bit texels (signed flag set), then widen to 4 floats each.
+		uint16_t halfRgb[16*3];
+		decodeBlockBc6h(halfRgb, _src, true);
+		const uint16_t* src = halfRgb;
+		for (float* dst = _dst, *end = _dst + 16*4; dst != end; dst += 4, src += 3)
+		{
+			dst[0] = bx::halfToFloat(src[0]);
+			dst[1] = bx::halfToFloat(src[1]);
+			dst[2] = bx::halfToFloat(src[2]);
+			dst[3] = 1.0f; // fourth channel is always 1.0
+		}
+	}
+
+	struct Bc7ModeInfo // per-mode BC7 layout parameters, indexed by mode number 0..7
+	{
+		uint8_t numSubsets;         // subsets in the block; each has two endpoints
+		uint8_t partitionBits;      // bits read for the partition set index
+		uint8_t rotationBits;       // bits read for the rotation mode
+		uint8_t indexSelectionBits; // bits read for the index selection mode
+		uint8_t colorBits;          // bits per R/G/B endpoint component, before any P-bit
+		uint8_t alphaBits;          // bits per alpha endpoint; 0 means alpha is set to 0xff
+		uint8_t endpointPBits;      // non-zero: one P-bit per endpoint
+		uint8_t sharedPBits;        // non-zero: one P-bit shared by both endpoints of a subset
+		uint8_t indexBits[2];       // widths of the two per-texel index streams; second is 0 when absent
+	};
+
+	static const Bc7ModeInfo s_bp7ModeInfo[] = // one row per BC7 mode; row index = mode number
+	{ //  +---------------------------- num subsets
+	  //  |  +------------------------- partition bits
+	  //  |  |  +---------------------- rotation bits
+	  //  |  |  |  +------------------- index selection bits
+	  //  |  |  |  |  +---------------- color bits
+	  //  |  |  |  |  |  +------------- alpha bits
+	  //  |  |  |  |  |  |  +---------- endpoint P-bits
+	  //  |  |  |  |  |  |  |  +------- shared P-bits
+	  //  |  |  |  |  |  |  |  |    +-- 2x index bits
+		{ 3, 4, 0, 0, 4, 0, 1, 0, { 3, 0 } }, // 0
+		{ 2, 6, 0, 0, 6, 0, 0, 1, { 3, 0 } }, // 1
+		{ 3, 6, 0, 0, 5, 0, 0, 0, { 2, 0 } }, // 2
+		{ 2, 6, 0, 0, 7, 0, 1, 0, { 2, 0 } }, // 3
+		{ 1, 0, 2, 1, 5, 6, 0, 0, { 2, 3 } }, // 4
+		{ 1, 0, 2, 0, 7, 8, 0, 0, { 2, 2 } }, // 5
+		{ 1, 0, 0, 0, 7, 7, 1, 0, { 4, 0 } }, // 6
+		{ 2, 6, 0, 0, 5, 5, 1, 0, { 2, 0 } }, // 7
+	};
+
+	void decodeBlockBc7(uint8_t _dst[16*4], const uint8_t _src[16]) // decodes one 16-byte BC7 block into 16 texels of 4 bytes (B, G, R, A)
+	{
+		BitReader bit(_src);
+
+		uint8_t mode = 0;
+		for (; mode < 8 && 0 == bit.read(1); ++mode) // mode = number of zero bits before the first set bit
+		{
+		}
+
+		if (mode == 8) // no mode bit set in the first 8 bits: output zeros
+		{
+			bx::memSet(_dst, 0, 16*4);
+			return;
+		}
+
+		const Bc7ModeInfo& mi  = s_bp7ModeInfo[mode];
+		const uint8_t modePBits = 0 != mi.endpointPBits
+			? mi.endpointPBits
+			: mi.sharedPBits
+			;
+
+		const uint8_t partitionSetIdx    = uint8_t(bit.read(mi.partitionBits) );
+		const uint8_t rotationMode       = uint8_t(bit.read(mi.rotationBits) );
+		const uint8_t indexSelectionMode = uint8_t(bit.read(mi.indexSelectionBits) );
+
+		uint8_t epR[6]; // up to 3 subsets x 2 endpoints per channel
+		uint8_t epG[6];
+		uint8_t epB[6];
+		uint8_t epA[6];
+
+		for (uint8_t ii = 0; ii < mi.numSubsets; ++ii) // all reds are stored first, then greens, blues, alphas
+		{
+			epR[ii*2+0] = uint8_t(bit.read(mi.colorBits) << modePBits);
+			epR[ii*2+1] = uint8_t(bit.read(mi.colorBits) << modePBits);
+		}
+
+		for (uint8_t ii = 0; ii < mi.numSubsets; ++ii)
+		{
+			epG[ii*2+0] = uint8_t(bit.read(mi.colorBits) << modePBits);
+			epG[ii*2+1] = uint8_t(bit.read(mi.colorBits) << modePBits);
+		}
+
+		for (uint8_t ii = 0; ii < mi.numSubsets; ++ii)
+		{
+			epB[ii*2+0] = uint8_t(bit.read(mi.colorBits) << modePBits);
+			epB[ii*2+1] = uint8_t(bit.read(mi.colorBits) << modePBits);
+		}
+
+		if (mi.alphaBits)
+		{
+			for (uint8_t ii = 0; ii < mi.numSubsets; ++ii)
+			{
+				epA[ii*2+0] = uint8_t(bit.read(mi.alphaBits) << modePBits);
+				epA[ii*2+1] = uint8_t(bit.read(mi.alphaBits) << modePBits);
+			}
+		}
+		else
+		{
+			bx::memSet(epA, 0xff, 6); // mode stores no alpha: use 0xff
+		}
+
+		if (0 != modePBits) // P-bits fill the low bit left free by the shifts above
+		{
+			for (uint8_t ii = 0; ii < mi.numSubsets; ++ii)
+			{
+				const uint8_t pda = uint8_t(                      bit.read(modePBits)      );
+				const uint8_t pdb = uint8_t(0 == mi.sharedPBits ? bit.read(modePBits) : pda);
+
+				epR[ii*2+0] |= pda;
+				epR[ii*2+1] |= pdb;
+				epG[ii*2+0] |= pda;
+				epG[ii*2+1] |= pdb;
+				epB[ii*2+0] |= pda;
+				epB[ii*2+1] |= pdb;
+				epA[ii*2+0] |= pda;
+				epA[ii*2+1] |= pdb;
+			}
+		}
+
+		const uint8_t colorBits = mi.colorBits + modePBits;
+
+		for (uint8_t ii = 0; ii < mi.numSubsets; ++ii) // widen every color endpoint to 8 bits
+		{
+			epR[ii*2+0] = bitRangeConvert(epR[ii*2+0], colorBits, 8);
+			epR[ii*2+1] = bitRangeConvert(epR[ii*2+1], colorBits, 8);
+			epG[ii*2+0] = bitRangeConvert(epG[ii*2+0], colorBits, 8);
+			epG[ii*2+1] = bitRangeConvert(epG[ii*2+1], colorBits, 8);
+			epB[ii*2+0] = bitRangeConvert(epB[ii*2+0], colorBits, 8);
+			epB[ii*2+1] = bitRangeConvert(epB[ii*2+1], colorBits, 8);
+		}
+
+		if (mi.alphaBits)
+		{
+			const uint8_t alphaBits = mi.alphaBits + modePBits;
+
+			for (uint8_t ii = 0; ii < mi.numSubsets; ++ii)
+			{
+				epA[ii*2+0] = bitRangeConvert(epA[ii*2+0], alphaBits, 8);
+				epA[ii*2+1] = bitRangeConvert(epA[ii*2+1], alphaBits, 8);
+			}
+		}
+
+		const bool hasIndexBits1 = 0 != mi.indexBits[1];
+
+		const uint8_t* factors[] = // weight table per index stream; the second falls back to the first
+		{
+			                s_bptcFactors[mi.indexBits[0]-2],
+			hasIndexBits1 ? s_bptcFactors[mi.indexBits[1]-2] : factors[0],
+		};
+
+		uint16_t offset[2] = // bit offsets from the cursor to the next index of each stream
+		{
+			0,
+			uint16_t(mi.numSubsets*(16*mi.indexBits[0]-1) ),
+		};
+
+		for (uint8_t yy = 0; yy < 4; ++yy)
+		{
+			for (uint8_t xx = 0; xx < 4; ++xx)
+			{
+				const uint8_t idx = yy*4+xx;
+
+				uint8_t subsetIndex = 0;
+				uint8_t indexAnchor = 0;
+				switch (mi.numSubsets)
+				{
+				case 2:
+					subsetIndex = (s_bptcP2[partitionSetIdx] >> idx) & 1;
+					indexAnchor = 0 != subsetIndex ? s_bptcA2[partitionSetIdx] : 0;
+					break;
+
+				case 3:
+					subsetIndex = (s_bptcP3[partitionSetIdx] >> (2*idx) ) & 3;
+					indexAnchor = 0 != subsetIndex ? s_bptcA3[subsetIndex-1][partitionSetIdx] : 0;
+					break;
+
+				default:
+					break;
+				}
+
+				const uint8_t anchor = idx == indexAnchor; // anchor texels store one index bit fewer
+				const uint8_t num[2] =
+				{
+					uint8_t(                mi.indexBits[0] - anchor    ),
+					uint8_t(hasIndexBits1 ? mi.indexBits[1] - anchor : 0),
+				};
+
+				const uint8_t index[2] =
+				{
+					                (uint8_t)bit.peek(offset[0], num[0]),
+					hasIndexBits1 ? (uint8_t)bit.peek(offset[1], num[1]) : index[0],
+				};
+
+				offset[0] += num[0];
+				offset[1] += num[1];
+
+				const uint8_t fc = factors[ indexSelectionMode][index[ indexSelectionMode] ]; // color weight
+				const uint8_t fa = factors[!indexSelectionMode][index[!indexSelectionMode] ]; // alpha weight, from the other stream
+
+				const uint8_t fca = 64 - fc;
+				const uint8_t fcb = fc;
+				const uint8_t faa = 64 - fa;
+				const uint8_t fab = fa;
+
+				subsetIndex *= 2; // now the array index of the subset's first endpoint
+				uint8_t rr = uint8_t(uint16_t(epR[subsetIndex]*fca + epR[subsetIndex + 1]*fcb + 32) >> 6);
+				uint8_t gg = uint8_t(uint16_t(epG[subsetIndex]*fca + epG[subsetIndex + 1]*fcb + 32) >> 6);
+				uint8_t bb = uint8_t(uint16_t(epB[subsetIndex]*fca + epB[subsetIndex + 1]*fcb + 32) >> 6);
+				uint8_t aa = uint8_t(uint16_t(epA[subsetIndex]*faa + epA[subsetIndex + 1]*fab + 32) >> 6);
+
+				switch (rotationMode) // rotation swaps alpha with one color channel
+				{
+				case 1: bx::xchg(aa, rr); break;
+				case 2: bx::xchg(aa, gg); break;
+				case 3: bx::xchg(aa, bb); break;
+				default:                  break;
+				};
+
+				uint8_t* bgra = &_dst[idx*4]; // output byte order is B, G, R, A
+				bgra[0] = bb;
+				bgra[1] = gg;
+				bgra[2] = rr;
+				bgra[3] = aa;
+			}
+		}
+	}
+
+	static const int32_t s_etc1Mod[8][4] = // presumably ETC1 intensity modifiers, [table][pixel index] - confirm at the use site
+	{
+		{  2,   8,  -2,   -8 },
+		{  5,  17,  -5,  -17 },
+		{  9,  29,  -9,  -29 },
+		{ 13,  42, -13,  -42 },
+		{ 18,  60, -18,  -60 },
+		{ 24,  80, -24,  -80 },
+		{ 33, 106, -33, -106 },
+		{ 47, 183, -47, -183 },
+	};
+
+	static const uint8_t s_etc2Mod[] = { 3, 6, 11, 16, 23, 32, 41, 64 }; // presumably ETC2 distance table - confirm at the use site
 
 	uint8_t uint8_sat(int32_t _a)
 	{
@@ -1893,10 +2891,10 @@ namespace bimg
 		const uint8_t numMips = _hasMips ? imageGetNumMips(_format, _width, _height, _depth) : 1;
 		uint32_t size = imageGetSize(NULL, _width, _height, _depth, _cubeMap, _hasMips, _numLayers, _format);
 
-		ImageContainer* imageContainer = (ImageContainer*)BX_ALLOC(_allocator, size + sizeof(ImageContainer) );
+		ImageContainer* imageContainer = (ImageContainer*)BX_ALIGNED_ALLOC(_allocator, size + BX_ALIGN_16(sizeof(ImageContainer) ), 16);
 
 		imageContainer->m_allocator   = _allocator;
-		imageContainer->m_data        = imageContainer + 1;
+		imageContainer->m_data        = bx::alignPtr(imageContainer + 1, 0, 16);
 		imageContainer->m_format      = _format;
 		imageContainer->m_orientation = Orientation::R0;
 		imageContainer->m_size        = size;
@@ -1922,7 +2920,7 @@ namespace bimg
 
 	void imageFree(ImageContainer* _imageContainer)
 	{
-		BX_FREE(_imageContainer->m_allocator, _imageContainer);
+		BX_ALIGNED_FREE(_imageContainer->m_allocator, _imageContainer, 16); // Freed through the allocator stored in the container; 16 mirrors the alignment passed to BX_ALIGNED_ALLOC where the container is allocated.
 	}
 
 // DDS
@@ -2911,18 +3909,18 @@ namespace bimg
 			{
 				uint32_t size = imageGetSize(NULL, uint16_t(_width), uint16_t(_height), 0, false, false, 1, TextureFormat::RGBA8);
 				void* temp = BX_ALLOC(_allocator, size);
-				imageDecodeToRgba8(temp, _src, _width, _height, _width*4, _srcFormat);
-				imageConvert(dst, TextureFormat::R8, temp, TextureFormat::RGBA8, _width, _height, 1, _width*4);
+				imageDecodeToRgba8(_allocator, temp, _src, _width, _height, _width*4, _srcFormat);
+				imageConvert(_allocator, dst, TextureFormat::R8, temp, TextureFormat::RGBA8, _width, _height, 1, _width*4);
 				BX_FREE(_allocator, temp);
 			}
 			else
 			{
-				imageConvert(dst, TextureFormat::R8, src, _srcFormat, _width, _height, 1, srcPitch);
+				imageConvert(_allocator, dst, TextureFormat::R8, src, _srcFormat, _width, _height, 1, srcPitch);
 			}
 		}
 	}
 
-	void imageDecodeToBgra8(void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _dstPitch, TextureFormat::Enum _srcFormat)
+	void imageDecodeToBgra8(bx::AllocatorI* _allocator, void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _dstPitch, TextureFormat::Enum _srcFormat)
 	{
 		const uint8_t* src = (const uint8_t*)_src;
 		uint8_t* dst = (uint8_t*)_dst;
@@ -3034,6 +4032,40 @@ namespace bimg
 			}
 			break;
 
+		case TextureFormat::BC6H:
+			{
+				ImageContainer* rgba32f = imageAlloc(_allocator
+					, TextureFormat::RGBA32F
+					, uint16_t(_width)
+					, uint16_t(_height)
+					, uint16_t(1)
+					, 1
+					, false
+					, false
+					);
+				imageDecodeToRgba32f(_allocator, rgba32f->m_data, _src, _width, _height, 1, _width*16, _srcFormat);
+				imageConvert(_allocator, _dst, TextureFormat::BGRA8, rgba32f->m_data, TextureFormat::RGBA32F, _width, _height, 1, _width*16);
+				imageFree(rgba32f);
+			}
+			break;
+
+		case TextureFormat::BC7:
+			for (uint32_t yy = 0; yy < height; ++yy)
+			{
+				for (uint32_t xx = 0; xx < width; ++xx)
+				{
+					decodeBlockBc7(temp, src);
+					src += 16;
+
+					uint8_t* block = &dst[yy*_dstPitch*4 + xx*16];
+					bx::memCopy(&block[0*_dstPitch], &temp[ 0], 16);
+					bx::memCopy(&block[1*_dstPitch], &temp[16], 16);
+					bx::memCopy(&block[2*_dstPitch], &temp[32], 16);
+					bx::memCopy(&block[3*_dstPitch], &temp[48], 16);
+				}
+			}
+			break;
+
 		case TextureFormat::ETC1:
 		case TextureFormat::ETC2:
 			for (uint32_t yy = 0; yy < height; ++yy)
@@ -3133,7 +4165,7 @@ namespace bimg
 			{
 				const uint32_t srcBpp   = s_imageBlockInfo[_srcFormat].bitsPerPixel;
 				const uint32_t srcPitch = _width * srcBpp / 8;
-				if (!imageConvert(_dst, TextureFormat::BGRA8, _src, _srcFormat, _width, _height, 1, srcPitch) )
+				if (!imageConvert(_allocator, _dst, TextureFormat::BGRA8, _src, _srcFormat, _width, _height, 1, srcPitch) )
 				{
 					// Failed to convert, just make ugly red-yellow checkerboard texture.
 					imageCheckerboard(_dst, _width, _height, 16, UINT32_C(0xffff0000), UINT32_C(0xffffff00) );
@@ -3143,7 +4175,7 @@ namespace bimg
 		}
 	}
 
-	void imageDecodeToRgba8(void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _dstPitch, TextureFormat::Enum _srcFormat)
+	void imageDecodeToRgba8(bx::AllocatorI* _allocator, void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _dstPitch, TextureFormat::Enum _srcFormat)
 	{
 		switch (_srcFormat)
 		{
@@ -3165,7 +4197,7 @@ namespace bimg
 		default:
 			{
 				const uint32_t srcPitch = _width * 4;
-				imageDecodeToBgra8(_dst, _src, _width, _height, _dstPitch, _srcFormat);
+				imageDecodeToBgra8(_allocator, _dst, _src, _width, _height, _dstPitch, _srcFormat);
 				imageSwizzleBgra8(_dst, _dstPitch, _width, _height, _dst, srcPitch);
 			}
 			break;
@@ -3214,7 +4246,7 @@ namespace bimg
 		const uint8_t* src = (const uint8_t*)_src;
 
 		using namespace bx;
-		const simd128_t unpack = simd_ld(1.0f, 1.0f/256.0f, 1.0f/65536.0f, 1.0f/16777216.0f);
+		const simd128_t unpack = simd_ld(1.0f/256.0f, 1.0f/256.0f/256.0f, 1.0f/65536.0f/256.0f, 1.0f/16777216.0f/256.0f);
 		const simd128_t umask  = simd_ild(0xff, 0xff00, 0xff0000, 0xff000000);
 		const simd128_t wflip  = simd_ild(0, 0, 0, 0x80000000);
 		const simd128_t wadd   = simd_ld(0.0f, 0.0f, 0.0f, 32768.0f*65536.0f);
@@ -3284,6 +4316,31 @@ namespace bimg
 				}
 				break;
 
+			case TextureFormat::BC6H:
+				{
+					uint32_t width  = _width/4;
+					uint32_t height = _height/4;
+
+					const uint8_t* srcData = src;
+
+					for (uint32_t yy = 0; yy < height; ++yy)
+					{
+						for (uint32_t xx = 0; xx < width; ++xx)
+						{
+							float tmp[16*4];
+							decodeBlockBc6h(tmp, srcData);
+							srcData += 16;
+
+							uint8_t* block = (uint8_t*)&dst[yy*_dstPitch*4 + xx*64];
+							bx::memCopy(&block[0*_dstPitch], &tmp[ 0], 64);
+							bx::memCopy(&block[1*_dstPitch], &tmp[16], 64);
+							bx::memCopy(&block[2*_dstPitch], &tmp[32], 64);
+							bx::memCopy(&block[3*_dstPitch], &tmp[48], 64);
+						}
+					}
+				}
+				break;
+
 			case TextureFormat::RGBA32F:
 				bx::memCopy(dst, src, _dstPitch*_height);
 				break;
@@ -3293,13 +4350,13 @@ namespace bimg
 				{
 					uint32_t size = imageGetSize(NULL, uint16_t(_width), uint16_t(_height), 0, false, false, 1, TextureFormat::RGBA8);
 					void* temp = BX_ALLOC(_allocator, size);
-					imageDecodeToRgba8(temp, src, _width, _height, _width*4, _srcFormat);
+					imageDecodeToRgba8(_allocator, temp, src, _width, _height, _width*4, _srcFormat);
 					imageRgba8ToRgba32f(dst, _width, _height, _width*4, temp);
 					BX_FREE(_allocator, temp);
 				}
 				else
 				{
-					imageConvert(dst, TextureFormat::RGBA32F, src, _srcFormat, _width, _height, 1, srcPitch);
+					imageConvert(_allocator, dst, TextureFormat::RGBA32F, src, _srcFormat, _width, _height, 1, srcPitch);
 				}
 				break;
 			}
diff --git a/src/image_encode.cpp b/src/image_encode.cpp
index f7abf4c..3e29ef1 100644
--- a/src/image_encode.cpp
+++ b/src/image_encode.cpp
@@ -35,7 +35,7 @@ namespace bimg
 	};
 	BX_STATIC_ASSERT(Quality::Count == BX_COUNTOF(s_squishQuality) );
 
-	void imageEncodeFromRgba8(void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _depth, TextureFormat::Enum _format, Quality::Enum _quality, bx::Error* _err)
+	void imageEncodeFromRgba8(bx::AllocatorI* _allocator, void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _depth, TextureFormat::Enum _format, Quality::Enum _quality, bx::Error* _err)
 	{
 		const uint8_t* src = (const uint8_t*)_src;
 		uint8_t* dst = (uint8_t*)_dst;
@@ -131,7 +131,7 @@ namespace bimg
 				break;
 
 			default:
-				if (!imageConvert(dst, _format, src, TextureFormat::RGBA8, _width, _height, 1) )
+				if (!imageConvert(_allocator, dst, _format, src, TextureFormat::RGBA8, _width, _height, 1) )
 				{
 					BX_ERROR_SET(_err, BIMG_ERROR, "Unable to convert between input/output formats!");
 				}
@@ -157,10 +157,10 @@ namespace bimg
 			break;
 
 		default:
-			if (!imageConvert(_dst, _dstFormat, _src, TextureFormat::RGBA32F, _width, _height, _depth) )
+			if (!imageConvert(_allocator, _dst, _dstFormat, _src, TextureFormat::RGBA32F, _width, _height, _depth) )
 			{
 				uint8_t* temp = (uint8_t*)BX_ALLOC(_allocator, _width*_height*_depth*4);
-				if (imageConvert(temp, TextureFormat::RGBA8, _src, TextureFormat::RGBA32F, _width, _height, _depth) )
+				if (imageConvert(_allocator, temp, TextureFormat::RGBA8, _src, TextureFormat::RGBA32F, _width, _height, _depth) )
 				{
 					for (uint32_t zz = 0; zz < _depth; ++zz)
 					{
@@ -183,7 +183,7 @@ namespace bimg
 						}
 					}
 
-					imageEncodeFromRgba8(_dst, temp, _width, _height, _depth, _dstFormat, _quality, _err);
+					imageEncodeFromRgba8(_allocator, _dst, temp, _width, _height, _depth, _dstFormat, _quality, _err);
 				}
 				else
 				{
@@ -211,8 +211,8 @@ namespace bimg
 			case bimg::TextureFormat::PTC14A:
 				{
 					uint8_t* temp = (uint8_t*)BX_ALLOC(_allocator, _width*_height*_depth*4);
-					imageDecodeToRgba8(temp, _src, _width, _height, _width*4, _srcFormat);
-					imageEncodeFromRgba8(_dst, temp, _width, _height, _depth, _dstFormat, _quality, _err);
+					imageDecodeToRgba8(_allocator, temp, _src, _width, _height, _width*4, _srcFormat);
+					imageEncodeFromRgba8(_allocator, _dst, temp, _width, _height, _depth, _dstFormat, _quality, _err);
 					BX_FREE(_allocator, temp);
 				}
 				break;
diff --git a/tools/texturec/texturec.cpp b/tools/texturec/texturec.cpp
index 972891c..58321cb 100644
--- a/tools/texturec/texturec.cpp
+++ b/tools/texturec/texturec.cpp
@@ -26,7 +26,7 @@
 #include <string>
 
 #define BIMG_TEXTUREC_VERSION_MAJOR 1
-#define BIMG_TEXTUREC_VERSION_MINOR 14
+#define BIMG_TEXTUREC_VERSION_MINOR 15
 
 struct Options
 {
@@ -145,8 +145,8 @@ bimg::ImageContainer* convert(bx::AllocatorI* _allocator, const void* _inputData
 
 	if (NULL != input)
 	{
-		const bimg::TextureFormat::Enum inputFormat  = input->m_format;
-		      bimg::TextureFormat::Enum outputFormat = input->m_format;
+		bimg::TextureFormat::Enum inputFormat  = input->m_format;
+		bimg::TextureFormat::Enum outputFormat = input->m_format;
 
 		if (bimg::TextureFormat::Count != _options.format)
 		{
@@ -211,7 +211,7 @@ bimg::ImageContainer* convert(bx::AllocatorI* _allocator, const void* _inputData
 
 		if (needResize)
 		{
-			bimg::ImageContainer* src = bimg::imageConvert(_allocator, bimg::TextureFormat::RGBA32F, *input);
+			bimg::ImageContainer* src = bimg::imageConvert(_allocator, bimg::TextureFormat::RGBA32F, *input, false);
 
 			bimg::ImageContainer* dst = bimg::imageAlloc(
 				  _allocator
@@ -229,6 +229,18 @@ bimg::ImageContainer* convert(bx::AllocatorI* _allocator, const void* _inputData
 			bimg::imageFree(src);
 			bimg::imageFree(input);
 
+			if (bimg::isCompressed(inputFormat) )
+			{
+				if (inputFormat == bimg::TextureFormat::BC6H)
+				{
+					inputFormat = bimg::TextureFormat::RGBA32F;
+				}
+				else
+				{
+					inputFormat = bimg::TextureFormat::RGBA8;
+				}
+			}
+
 			input = bimg::imageConvert(_allocator, inputFormat, *dst);
 			bimg::imageFree(dst);
 		}
@@ -396,7 +408,7 @@ bimg::ImageContainer* convert(bx::AllocatorI* _allocator, const void* _inputData
 					BX_FREE(_allocator, rgbaDst);
 				}
 				// HDR
-				else if ( (!bimg::isCompressed(input->m_format) && 8 != inputBlockInfo.rBits)
+				else if ( (!bimg::isCompressed(inputFormat) && 8 != inputBlockInfo.rBits)
 					 || outputFormat == bimg::TextureFormat::BC6H
 					 || outputFormat == bimg::TextureFormat::BC7
 						)
@@ -559,7 +571,9 @@ bimg::ImageContainer* convert(bx::AllocatorI* _allocator, const void* _inputData
 					temp = BX_ALLOC(_allocator, size);
 					uint8_t* rgba = (uint8_t*)temp;
 
-					bimg::imageDecodeToRgba8(rgba
+					bimg::imageDecodeToRgba8(
+						  _allocator
+						, rgba
 						, mip.m_data
 						, mip.m_width
 						, mip.m_height
@@ -600,7 +614,9 @@ bimg::ImageContainer* convert(bx::AllocatorI* _allocator, const void* _inputData
 					bimg::imageGetRawData(*output, side, 0, output->m_data, output->m_size, dstMip);
 					dstData = const_cast<uint8_t*>(dstMip.m_data);
 
-					bimg::imageEncodeFromRgba8(dstData
+					bimg::imageEncodeFromRgba8(
+						  _allocator
+						, dstData
 						, rgba
 						, dstMip.m_width
 						, dstMip.m_height
@@ -647,7 +663,9 @@ bimg::ImageContainer* convert(bx::AllocatorI* _allocator, const void* _inputData
 						bimg::imageGetRawData(*output, side, lod, output->m_data, output->m_size, dstMip);
 						dstData = const_cast<uint8_t*>(dstMip.m_data);
 
-						bimg::imageEncodeFromRgba8(dstData
+						bimg::imageEncodeFromRgba8(
+							  _allocator
+							, dstData
 							, rgba
 							, dstMip.m_width
 							, dstMip.m_height
@@ -660,7 +678,9 @@ bimg::ImageContainer* convert(bx::AllocatorI* _allocator, const void* _inputData
 
 					if (NULL != ref)
 					{
-						bimg::imageDecodeToRgba8(rgba
+						bimg::imageDecodeToRgba8(
+							  _allocator
+							, rgba
 							, output->m_data
 							, mip.m_width
 							, mip.m_height
@@ -776,6 +796,30 @@ void help(const char* _str, const bx::Error& _err)
 	help(str.c_str(), false);
 }
 
+class AlignedAllocator : public bx::AllocatorI // Allocator adapter: forwards every request to another allocator while enforcing a minimum alignment.
+{
+public:
+	AlignedAllocator(bx::AllocatorI* _allocator, size_t _minAlignment) // _allocator: allocator all requests are forwarded to; _minAlignment: lower bound applied to each request's alignment.
+		: m_allocator(_allocator)
+		, m_minAlignment(_minAlignment)
+	{
+	}
+
+	virtual void* realloc( // The only member function defined here; NOTE(review): presumably overrides bx::AllocatorI::realloc -- confirm the signature matches the base.
+			void* _ptr
+		, size_t _size
+		, size_t _align
+		, const char* _file
+		, uint32_t _line
+		)
+	{
+		return m_allocator->realloc(_ptr, _size, bx::max(_align, m_minAlignment), _file, _line); // Only the alignment is adjusted (raised to m_minAlignment when smaller); all other arguments pass through unchanged.
+	}
+
+	bx::AllocatorI* m_allocator; // Wrapped allocator; this class declares no destructor and never frees it.
+	size_t m_minAlignment; // Alignment used whenever the caller requests less.
+};
+
 int main(int _argc, const char* _argv[])
 {
 	bx::CommandLine cmdLine(_argc, _argv);
@@ -927,7 +971,9 @@ int main(int _argc, const char* _argv[])
 		return bx::kExitFailure;
 	}
 
-	bx::DefaultAllocator allocator;
+	bx::DefaultAllocator defaultAllocator;
+	AlignedAllocator allocator(&defaultAllocator, 16);
+
 	uint8_t* inputData = (uint8_t*)BX_ALLOC(&allocator, inputSize);
 
 	bx::read(&reader, inputData, inputSize, &err);