loader/webp: Support static webp loader

Built the libwebp code in to support the static option of the webp loader. Code from: https://github.com/jacklicn/libwebp This forked version only contains the minimal webp decoding feature. Binary Size: +68kb Co-authored-by: Hermet Park <hermet@lottiefiles.com> issue: https://github.com/thorvg/thorvg/issues/1427
2025-07-24 23:28:57 +00:00 · 2024-02-06 12:09:29 +09:00 · 2024-02-06 12:09:29 +09:00 · df4f93e79f
commit df4f93e79f
parent 509181bf2a
60 changed files with 12630 additions and 40 deletions
--- a/.github/workflows/build_ubuntu.yml
+++ b/.github/workflows/build_ubuntu.yml
@ -87,7 +87,7 @@ jobs:
    - name: Build & Run memcheck Script(ASAN)
      run: |
        sudo rm -rf ./build
-        meson setup build -Db_sanitize="address,undefined" -Dloaders="all, webp_beta" -Dsavers="all" -Dtests="true" -Dbindings="capi"
+        meson setup build -Db_sanitize="address,undefined" -Dloaders="all" -Dsavers="all" -Dtests="true" -Dbindings="capi"
        sudo ninja -C build install
        export PATH=$PATH:~/.local/bin/
        chmod +x "${GITHUB_WORKSPACE}/.github/workflows/memcheck_asan.sh"
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@ -71,7 +71,7 @@ jobs:
        sudo pip3 install meson
        sudo apt-get install libturbojpeg0-dev libpng-dev libwebp-dev
-        meson . build -Dbindings="capi" -Dloaders="all, webp_beta" -Dsavers="all" -Dtools="all"
+        meson . build -Dbindings="capi" -Dloaders="all" -Dsavers="all" -Dtools="all"
        sudo ninja -C build install
--- a/meson.build
+++ b/meson.build
@ -62,7 +62,7 @@ if all_loaders or get_option('loaders').contains('ttf') == true
    config_h.set10('THORVG_TTF_LOADER_SUPPORT', true)
 endif
-if get_option('loaders').contains('webp_beta') == true
+if all_loaders or get_option('loaders').contains('webp') == true
    config_h.set10('THORVG_WEBP_LOADER_SUPPORT', true)
 endif
@ -140,15 +140,15 @@ Summary:
    Multi-Tasking:           @3@
    SIMD Instruction:        @4@
    Raster Engine (SW):      @5@
-    Raster Engine (GL):    @6@
+    Raster Engine (GL_BETA): @6@
-    Raster Engine (WG):    @7@
+    Raster Engine (WG_BETA): @7@
    Loader (TVG):            @8@
    Loader (SVG):            @9@
    Loader (TTF):            @10@
    Loader (LOTTIE):         @11@
    Loader (PNG):            @12@
    Loader (JPG):            @13@
-    Loader (WEBP_BETA):    @14@
+    Loader (WEBP):           @14@
    Saver (TVG):             @15@
    Saver (GIF):             @16@
    Binding (CAPI):          @17@
@ -175,7 +175,7 @@ Summary:
        all_loaders or get_option('loaders').contains('lottie'),
        all_loaders or get_option('loaders').contains('png'),
        all_loaders or get_option('loaders').contains('jpg'),
-        get_option('loaders').contains('webp_beta'),
+        all_loaders or get_option('loaders').contains('webp'),
        all_savers or get_option('savers').contains('tvg'),
        all_savers or get_option('savers').contains('gif'),
        get_option('bindings').contains('capi'),
--- a/meson_options.txt
+++ b/meson_options.txt
@ -6,9 +6,9 @@ option('engines',
 option('loaders',
   type: 'array',
-   choices: ['', 'tvg', 'svg', 'png', 'jpg', 'lottie', 'ttf', 'webp_beta', 'all'],
+   choices: ['', 'tvg', 'svg', 'png', 'jpg', 'lottie', 'ttf', 'webp', 'all'],
   value: ['svg', 'tvg', 'lottie'],
-   description: 'Enable File Loaders in thorvg ("all" does not include "*_beta".)')
+   description: 'Enable File Loaders in thorvg')
 option('savers',
   type: 'array',
--- a/src/examples/PictureWebp.cpp
+++ b/src/examples/PictureWebp.cpp
@ -75,7 +75,6 @@ void tvgDrawCmds(tvg::Canvas* canvas)
    picture->translate(400, 0);
    picture->scale(0.8);
    canvas->push(std::move(picture));
 }
--- a/src/loaders/external_webp/meson.build
+++ b/src/loaders/external_webp/meson.build
@ -3,10 +3,12 @@ source_file = [
   'tvgWebpLoader.cpp',
 ]
-webp_dep = dependency('libwebp', required: true)
+webp_dep = dependency('libwebp', required: false)
-subloader_dep += [declare_dependency(
+if webp_dep.found()
   subloader_dep += [declare_dependency(
      include_directories : include_directories('.'),
      dependencies : webp_dep,
      sources : source_file
      )]
 endif
--- a/src/loaders/meson.build
+++ b/src/loaders/meson.build
@ -38,11 +38,14 @@ if all_loaders or get_option('loaders').contains('jpg') == true
    endif
 endif
-if get_option('loaders').contains('webp_beta') == true
+if all_loaders or get_option('loaders').contains('webp') == true
    if get_option('static') == true
-        message('static webp is not available, disable webp_beta loader.')
+        subdir('webp')
    else
        subdir('external_webp')
        if not webp_dep.found()
            subdir('webp')
        endif
    endif
 endif
--- a/src/loaders/webp/dec/alpha.cpp
+++ b/src/loaders/webp/dec/alpha.cpp
@ -0,0 +1,167 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha-plane decompression.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <stdlib.h>
 #include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "../dsp/dsp.h"
 #include "../utils/quant_levels_dec.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 //------------------------------------------------------------------------------
 // ALPHDecoder object.
 // Allocates a zero-filled alpha decoder.
 // Returns NULL if the allocation fails. Release with ALPHDelete().
 ALPHDecoder* ALPHNew(void) {
  return static_cast<ALPHDecoder*>(calloc(1ULL, sizeof(ALPHDecoder)));
 }
 // Destroys an alpha decoder: the lossless sub-decoder is released through
 // VP8LDelete() first, then the object itself is freed. NULL is a no-op.
 void ALPHDelete(ALPHDecoder* const dec) {
  if (dec == NULL) return;
  VP8LDelete(dec->vp8l_dec_);
  dec->vp8l_dec_ = NULL;
  free(dec);
 }
 //------------------------------------------------------------------------------
 // Decoding.
 // Initialize alpha decoding by parsing the alpha header and, for alpha data
 // stored using lossless compression, decoding the lossless image header.
 //
 // Header byte layout (data[0]):
 //   bits 0-1: compression method, bits 2-3: filter,
 //   bits 4-5: pre-processing,     bits 6-7: reserved (must be zero).
 //
 // dec       - decoder to fill in (dimensions and header fields are written).
 // data      - alpha chunk, starting at the header byte.
 // data_size - number of bytes available in 'data'.
 // width     - plane width in pixels (> 0).
 // height    - plane height in pixels (> 0).
 // output    - destination plane, forwarded to VP8LDecodeAlphaHeader().
 //
 // Returns false in case of error in alpha header (data too short, invalid
 // compression method or filter, error in lossless header data etc).
 static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
                     size_t data_size, int width, int height, uint8_t* output) {
  int ok = 0;
  int rsrv;
  assert(width > 0 && height > 0);
  assert(data != NULL && output != NULL);
  dec->width_ = width;
  dec->height_ = height;
  if (data_size <= ALPHA_HEADER_LEN) {
    return 0;
  }
  // The payload pointer and size are derived only once the chunk is known to
  // be longer than the header, so neither the pointer arithmetic nor the
  // unsigned subtraction can step outside the input.
  const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
  const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
  dec->method_ = (data[0] >> 0) & 0x03;
  dec->filter_ = static_cast<WEBP_FILTER_TYPE>((data[0] >> 2) & 0x03);
  dec->pre_processing_ = (data[0] >> 4) & 0x03;
  rsrv = (data[0] >> 6) & 0x03;
  if (dec->method_ < ALPHA_NO_COMPRESSION ||
      dec->method_ > ALPHA_LOSSLESS_COMPRESSION ||
      dec->filter_ >= WEBP_FILTER_LAST ||
      dec->pre_processing_ > ALPHA_PREPROCESSED_LEVELS ||
      rsrv != 0) {
    return 0;
  }
  if (dec->method_ == ALPHA_NO_COMPRESSION) {
    // Widen before multiplying so the pixel count is computed in size_t
    // rather than in (overflow-prone) int arithmetic.
    const size_t alpha_decoded_size =
        (size_t)dec->width_ * (size_t)dec->height_;
    ok = (alpha_data_size >= alpha_decoded_size);
  } else {
    assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output);
  }
  // Note: the filter table is initialized even when 'ok' is false.
  VP8FiltersInit();
  return ok;
 }
 // Decodes and unfilters *at least* 'num_rows' rows of alpha, beginning at
 // row 'row'. Rows before 'row' are assumed to be decoded already.
 // Uncompressed alpha is copied straight from the chunk payload; lossless
 // alpha is decoded up to row (row + num_rows) by the VP8L decoder.
 // Sets dec->is_alpha_decoded_ once the last picture row has been reached.
 // Returns false in case of bitstream error.
 static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
  ALPHDecoder* const alpha = dec->alph_dec_;
  const int plane_w = alpha->width_;
  const int plane_h = alpha->height_;
  WebPUnfilterFunc unfilter = WebPUnfilters[alpha->filter_];
  uint8_t* const plane = dec->alpha_plane_;
  if (alpha->method_ != ALPHA_NO_COMPRESSION) {
    // alpha->method_ == ALPHA_LOSSLESS_COMPRESSION
    assert(alpha->vp8l_dec_ != NULL);
    if (!VP8LDecodeAlphaImageStream(alpha, row + num_rows)) {
      return 0;
    }
  } else {
    const size_t first_pixel = row * plane_w;
    const size_t pixel_count = num_rows * plane_w;
    assert(dec->alpha_data_size_ >=
           ALPHA_HEADER_LEN + first_pixel + pixel_count);
    memcpy(plane + first_pixel,
           dec->alpha_data_ + ALPHA_HEADER_LEN + first_pixel, pixel_count);
  }
  if (unfilter != NULL) {
    unfilter(plane_w, plane_h, plane_w, row, num_rows, plane);
  }
  if (row + num_rows == dec->pic_hdr_.height_) {
    dec->is_alpha_decoded_ = 1;
  }
  return 1;
 }
 //------------------------------------------------------------------------------
 // Main entry point.
 // Decompresses alpha rows [row, row + num_rows) into dec->alpha_plane_.
 // A call with row == 0 creates and initializes the alpha decoder; the decoder
 // is destroyed again on error or once the whole plane has been decoded.
 // Returns a pointer to row 'row' inside the alpha plane, or NULL on error
 // (bad row range, allocation failure, invalid header, bitstream error).
 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                       int row, int num_rows) {
  const int pic_width = dec->pic_hdr_.width_;
  const int pic_height = dec->pic_hdr_.height_;
  // sanity check: the requested rows must lie inside the picture.
  if (!(row >= 0 && num_rows > 0 && row + num_rows <= pic_height)) {
    return NULL;
  }
  if (row == 0) {
    // Initialize decoding.
    assert(dec->alpha_plane_ != NULL);
    dec->alph_dec_ = ALPHNew();
    if (dec->alph_dec_ == NULL) return NULL;
    const int init_ok = ALPHInit(dec->alph_dec_, dec->alpha_data_,
                                 dec->alpha_data_size_, pic_width, pic_height,
                                 dec->alpha_plane_);
    if (!init_ok) {
      ALPHDelete(dec->alph_dec_);
      dec->alph_dec_ = NULL;
      return NULL;
    }
    // if we allowed use of alpha dithering, check whether it's needed at all
    if (dec->alph_dec_->pre_processing_ == ALPHA_PREPROCESSED_LEVELS) {
      num_rows = pic_height;      // decode everything in one pass
    } else {
      dec->alpha_dithering_ = 0;  // disable dithering
    }
  }
  if (!dec->is_alpha_decoded_) {
    assert(dec->alph_dec_ != NULL);
    int ok = ALPHDecode(dec, row, num_rows);
    if (ok && dec->alpha_dithering_ > 0) {
      ok = WebPDequantizeLevels(dec->alpha_plane_, pic_width, pic_height,
                                dec->alpha_dithering_);
    }
    // The decoder is no longer needed after a failure or a completed plane.
    const int release_decoder = (!ok || dec->is_alpha_decoded_);
    if (release_decoder) {
      ALPHDelete(dec->alph_dec_);
      dec->alph_dec_ = NULL;
    }
    if (!ok) return NULL;  // Error.
  }
  // Return a pointer to the current decoded row.
  return dec->alpha_plane_ + row * pic_width;
 }
--- a/src/loaders/webp/dec/alphai.h
+++ b/src/loaders/webp/dec/alphai.h
@ -0,0 +1,55 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha decoder: internal header.
 //
 // Author: Urvang (urvang@google.com)
 #ifndef WEBP_DEC_ALPHAI_H_
 #define WEBP_DEC_ALPHAI_H_
 #include "./webpi.h"
 #include "../dsp/dsp.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 struct VP8LDecoder;  // Defined in dec/vp8li.h.
 typedef struct ALPHDecoder ALPHDecoder;
 // State of an alpha-plane decoder. Instances are created zero-filled by
 // ALPHNew() and destroyed by ALPHDelete().
 struct ALPHDecoder {
  // Dimensions of the alpha plane, recorded when decoding is initialized.
  int width_;
  int height_;
  // Compression method, taken from bits 0-1 of the alpha header byte.
  int method_;
  // Filter type, taken from bits 2-3 of the alpha header byte.
  WEBP_FILTER_TYPE filter_;
  // Pre-processing level, taken from bits 4-5 of the alpha header byte.
  int pre_processing_;
  // Lossless sub-decoder; released through VP8LDelete() by ALPHDelete().
  struct VP8LDecoder* vp8l_dec_;
  // NOTE(review): presumably the I/O context handed to the lossless decoder;
  // its use is not visible from this header - confirm in dec/vp8l.cpp.
  VP8Io io_;
  int use_8b_decode;  // Although alpha channel requires only 1 byte per
                      // pixel, sometimes VP8LDecoder may need to allocate
                      // 4 bytes per pixel internally during decode.
 };
 //------------------------------------------------------------------------------
 // internal functions. Not public.
 // Allocates a new alpha decoder instance.
 ALPHDecoder* ALPHNew(void);
 // Clears and deallocates an alpha decoder instance.
 void ALPHDelete(ALPHDecoder* const dec);
 //------------------------------------------------------------------------------
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_DEC_ALPHAI_H_ */
--- a/src/loaders/webp/dec/buffer.cpp
+++ b/src/loaders/webp/dec/buffer.cpp
@ -0,0 +1,238 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Everything about WebPDecBuffer
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <stdlib.h>
 #include "./vp8i.h"
 #include "./webpi.h"
 #include "../utils/utils.h"
 //------------------------------------------------------------------------------
 // WebPDecBuffer
 // Number of bytes per pixel for the different color-spaces.
 // The table holds MODE_LAST entries and is indexed by the colorspace mode.
 // NOTE(review): the entry order presumably follows the declaration order of
 // the WEBP_CSP_MODE enum (declared outside this file) - confirm when editing.
 static const int kModeBpp[MODE_LAST] = {
  3, 4, 3, 4, 4, 2, 2,
  4, 4, 4, 2,    // pre-multiplied modes
  1, 1 };
 // Returns non-zero when 'webp_csp_mode' lies in [MODE_RGB, MODE_LAST).
 // The argument is a plain int so that the range test behaves the same for
 // signed and unsigned enum representations, without casts or type-limit
 // warnings.
 static int IsValidColorspace(int webp_csp_mode) {
  if (webp_csp_mode < MODE_RGB) return 0;
  return webp_csp_mode < MODE_LAST;
 }
 // Verifies that 'buffer' is usable as a decoding target: valid colorspace,
 // non-NULL plane pointers, strides wide enough for one row and plane sizes
 // large enough for all rows. Stride magnitudes are used, so negative
 // (flipped) strides are accepted.
 // Returns VP8_STATUS_OK, or VP8_STATUS_INVALID_PARAM if any check fails.
 static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
  const WEBP_CSP_MODE mode = buffer->colorspace;
  const int width = buffer->width;
  const int height = buffer->height;
  int ok;
  if (!IsValidColorspace(mode)) {
    ok = 0;
  } else if (WebPIsRGBMode(mode)) {    // RGB checks
    const WebPRGBABuffer* const rgb = &buffer->u.RGBA;
    const int stride = abs(rgb->stride);
    const uint64_t needed = (uint64_t)stride * height;
    ok = (needed <= rgb->size) &&
         (stride >= width * kModeBpp[mode]) &&
         (rgb->rgba != NULL);
  } else {                             // YUV checks
    const WebPYUVABuffer* const yuv = &buffer->u.YUVA;
    const int chroma_w = (width + 1) / 2;
    const int chroma_h = (height + 1) / 2;
    const int y_stride = abs(yuv->y_stride);
    const int u_stride = abs(yuv->u_stride);
    const int v_stride = abs(yuv->v_stride);
    const int a_stride = abs(yuv->a_stride);
    const uint64_t y_size = (uint64_t)y_stride * height;
    const uint64_t u_size = (uint64_t)u_stride * chroma_h;
    const uint64_t v_size = (uint64_t)v_stride * chroma_h;
    const uint64_t a_size = (uint64_t)a_stride * height;
    ok = (y_size <= yuv->y_size) &&
         (u_size <= yuv->u_size) &&
         (v_size <= yuv->v_size) &&
         (y_stride >= width) &&
         (u_stride >= chroma_w) &&
         (v_stride >= chroma_w) &&
         (yuv->y != NULL) &&
         (yuv->u != NULL) &&
         (yuv->v != NULL);
    if (mode == MODE_YUVA) {
      // The alpha plane is only required when the mode carries alpha.
      ok = ok &&
           (a_stride >= width) &&
           (a_size <= yuv->a_size) &&
           (yuv->a != NULL);
    }
  }
  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
 }
 // Allocates the output memory for 'buffer' and wires up its plane pointers,
 // strides and sizes, using buffer->width / height / colorspace.
 // Nothing is allocated when the memory is external or already allocated; the
 // buffer is validated with CheckDecBuffer() in every case.
 //
 // Returns:
 //   VP8_STATUS_INVALID_PARAM  - non-positive dimensions, invalid colorspace,
 //                               a row stride that does not fit in an int, or
 //                               a buffer rejected by CheckDecBuffer().
 //   VP8_STATUS_OUT_OF_MEMORY  - the required size does not fit in size_t or
 //                               malloc() failed.
 //   VP8_STATUS_OK             - otherwise.
 static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
  const int w = buffer->width;
  const int h = buffer->height;
  const WEBP_CSP_MODE mode = buffer->colorspace;
  if (w <= 0 || h <= 0 || !IsValidColorspace(mode)) {
    return VP8_STATUS_INVALID_PARAM;
  }
  if (!buffer->is_external_memory && buffer->private_memory == NULL) {
    uint8_t* output;
    int uv_stride = 0, a_stride = 0;
    uint64_t uv_size = 0, a_size = 0, total_size;
    // We need memory and it hasn't been allocated yet.
    // => initialize output buffer, now that dimensions are known.
    // The stride is stored in an 'int', so compute it in 64 bits first and
    // reject widths whose row size would overflow that int.
    const uint64_t wide_stride = (uint64_t)w * kModeBpp[mode];
    if (wide_stride >= (1ULL << 31)) {
      return VP8_STATUS_INVALID_PARAM;
    }
    const int stride = (int)wide_stride;
    const uint64_t size = wide_stride * h;
    if (!WebPIsRGBMode(mode)) {
      uv_stride = (w + 1) / 2;
      uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
      if (mode == MODE_YUVA) {
        a_stride = w;
        a_size = (uint64_t)a_stride * h;
      }
    }
    total_size = size + 2 * uv_size + a_size;
    // Security/sanity check: malloc() takes a size_t. Where size_t is narrower
    // than 64 bits, a silently truncated request would yield a buffer smaller
    // than the plane sizes recorded below, so refuse it instead.
    if (total_size != (uint64_t)(size_t)total_size) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
    output = (uint8_t*)malloc((size_t)total_size * sizeof(*output));
    if (output == NULL) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
    buffer->private_memory = output;
    if (!WebPIsRGBMode(mode)) {   // YUVA initialization
      // Planes are laid out back to back: Y, U, V, then (optionally) A.
      WebPYUVABuffer* const buf = &buffer->u.YUVA;
      buf->y = output;
      buf->y_stride = stride;
      buf->y_size = (size_t)size;
      buf->u = output + size;
      buf->u_stride = uv_stride;
      buf->u_size = (size_t)uv_size;
      buf->v = output + size + uv_size;
      buf->v_stride = uv_stride;
      buf->v_size = (size_t)uv_size;
      if (mode == MODE_YUVA) {
        buf->a = output + size + 2 * uv_size;
      }
      buf->a_size = (size_t)a_size;
      buf->a_stride = a_stride;
    } else {  // RGBA initialization
      WebPRGBABuffer* const buf = &buffer->u.RGBA;
      buf->rgba = output;
      buf->stride = stride;
      buf->size = (size_t)size;
    }
  }
  return CheckDecBuffer(buffer);
 }
 // Flips 'buffer' vertically without touching any pixel: every plane pointer
 // is advanced to its last row and the matching stride is negated.
 // The alpha plane of a YUV buffer is only adjusted when it is present.
 // Returns VP8_STATUS_INVALID_PARAM for a NULL buffer, VP8_STATUS_OK otherwise.
 VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
  if (buffer == NULL) {
    return VP8_STATUS_INVALID_PARAM;
  }
  const int last_row = buffer->height - 1;
  if (!WebPIsRGBMode(buffer->colorspace)) {
    WebPYUVABuffer* const yuv = &buffer->u.YUVA;
    const int last_chroma_row = last_row >> 1;
    yuv->y += last_row * yuv->y_stride;
    yuv->y_stride = -yuv->y_stride;
    yuv->u += last_chroma_row * yuv->u_stride;
    yuv->u_stride = -yuv->u_stride;
    yuv->v += last_chroma_row * yuv->v_stride;
    yuv->v_stride = -yuv->v_stride;
    if (yuv->a != NULL) {
      yuv->a += last_row * yuv->a_stride;
      yuv->a_stride = -yuv->a_stride;
    }
  } else {
    WebPRGBABuffer* const rgb = &buffer->u.RGBA;
    rgb->rgba += last_row * rgb->stride;
    rgb->stride = -rgb->stride;
  }
  return VP8_STATUS_OK;
 }
 // Sets up the output buffer 'out' for a picture of w x h pixels.
 //
 // w, h    - source picture dimensions (must be > 0).
 // options - optional decoding options (may be NULL). Cropping is applied
 //           first (crop_left / crop_top are rounded down to even values),
 //           then scaling; the resulting dimensions are stored in 'out'.
 //           When options->flip is set the buffer is flipped vertically.
 // out     - buffer to allocate/validate (must not be NULL).
 //
 // Returns VP8_STATUS_INVALID_PARAM for bad arguments, an out-of-frame crop or
 // non-positive scaled dimensions; otherwise the status of the allocation
 // (and of the flip, when requested).
 VP8StatusCode WebPAllocateDecBuffer(int w, int h,
                                     const WebPDecoderOptions* const options,
                                     WebPDecBuffer* const out) {
  VP8StatusCode status;
  if (out == NULL || w <= 0 || h <= 0) {
    return VP8_STATUS_INVALID_PARAM;
  }
  if (options != NULL) {    // First, apply options if there is any.
    if (options->use_cropping) {
      const int cw = options->crop_width;
      const int ch = options->crop_height;
      const int x = options->crop_left & ~1;
      const int y = options->crop_top & ~1;
      // 'cw > w - x' is the overflow-free form of 'x + cw > w': x >= 0 has
      // been established by the earlier operands and w > 0, so the subtraction
      // cannot overflow, whereas the addition could wrap for huge crop values
      // and let an out-of-frame rectangle through.
      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
          cw > w - x || ch > h - y) {
        return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
      }
      w = cw;
      h = ch;
    }
    if (options->use_scaling) {
      if (options->scaled_width <= 0 || options->scaled_height <= 0) {
        return VP8_STATUS_INVALID_PARAM;
      }
      w = options->scaled_width;
      h = options->scaled_height;
    }
  }
  out->width = w;
  out->height = h;
  // Then, allocate buffer for real.
  status = AllocateBuffer(out);
  if (status != VP8_STATUS_OK) return status;
  // Use the stride trick if vertical flip is needed.
  if (options != NULL && options->flip) {
    status = WebPFlipBuffer(out);
  }
  return status;
 }
 //------------------------------------------------------------------------------
 // constructors / destructors
 // Version-checked initializer: zero-fills '*buffer'.
 // Returns 0 on ABI version mismatch or when 'buffer' is NULL, 1 on success.
 int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) {
  const int abi_mismatch =
      WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION);
  if (abi_mismatch || buffer == NULL) {
    return 0;
  }
  memset(buffer, 0, sizeof(*buffer));
  return 1;
 }
 // Releases the memory owned by 'buffer' (nothing is freed when the memory is
 // external) and clears the private_memory pointer. NULL is a no-op.
 void WebPFreeDecBuffer(WebPDecBuffer* buffer) {
  if (buffer == NULL) return;
  if (!buffer->is_external_memory) {
    free(buffer->private_memory);
  }
  buffer->private_memory = NULL;
 }
 // Copies the description of 'src' into 'dst'. If 'src' owns its memory, the
 // copy is marked as external and does not take ownership.
 // Does nothing when either pointer is NULL.
 void WebPCopyDecBuffer(const WebPDecBuffer* const src,
                        WebPDecBuffer* const dst) {
  if (src == NULL || dst == NULL) return;
  *dst = *src;
  if (src->private_memory == NULL) return;
  dst->is_external_memory = 1;   // dst buffer doesn't own the memory.
  dst->private_memory = NULL;
 }
 //------------------------------------------------------------------------------
--- a/src/loaders/webp/dec/common.h
+++ b/src/loaders/webp/dec/common.h
@ -0,0 +1,54 @@
 // Copyright 2015 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Definitions and macros common to encoding and decoding
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_DEC_COMMON_H_
 #define WEBP_DEC_COMMON_H_
 // intra prediction modes
 // Several enumerators deliberately share numeric values: the Luma16/UV names
 // alias four of the 4x4 modes, and the "special" DC variants reuse the
 // values 4..6 that also denote 4x4 modes.
 enum { B_DC_PRED = 0,   // 4x4 modes
        B_TM_PRED = 1,
        B_VE_PRED = 2,
        B_HE_PRED = 3,
        B_RD_PRED = 4,
        B_VR_PRED = 5,
        B_LD_PRED = 6,
        B_VL_PRED = 7,
        B_HD_PRED = 8,
        B_HU_PRED = 9,
        NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED,  // = 10
        // Luma16 or UV modes
        DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
        H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
        B_PRED = NUM_BMODES,   // refined I4x4 mode
        NUM_PRED_MODES = 4,
        // special modes
        // DC prediction variants for blocks lacking a top and/or left
        // neighbour.
        B_DC_PRED_NOTOP = 4,
        B_DC_PRED_NOLEFT = 5,
        B_DC_PRED_NOTOPLEFT = 6,
        NUM_B_DC_MODES = 7 };
 // Fixed counts shared by the decoder.
 // NOTE(review): judging by the names these size per-segment, loop-filter
 // delta, partition and coefficient-probability tables; the users are outside
 // this header - confirm before changing any value.
 enum { MB_FEATURE_TREE_PROBS = 3,
        NUM_MB_SEGMENTS = 4,
        NUM_REF_LF_DELTAS = 4,
        NUM_MODE_LF_DELTAS = 4,    // I4x4, ZERO, *, SPLIT
        MAX_NUM_PARTITIONS = 8,
        // Probabilities
        NUM_TYPES = 4,   // 0: i16-AC,  1: i16-DC,  2:chroma-AC,  3:i4-AC
        NUM_BANDS = 8,
        NUM_CTX = 3,
        NUM_PROBAS = 11
      };
 #endif    // WEBP_DEC_COMMON_H_
--- a/src/loaders/webp/dec/decode_vp8.h
+++ b/src/loaders/webp/dec/decode_vp8.h
@ -0,0 +1,177 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Low-level API for VP8 decoder
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_WEBP_DECODE_VP8_H_
 #define WEBP_WEBP_DECODE_VP8_H_
 #include "../webp/decode.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // Lower-level API
 //
 // These functions provide fine-grained control of the decoding process.
 // The call flow should resemble:
 //
 //   VP8Io io;
 //   VP8InitIo(&io);
 //   io.data = data;
 //   io.data_size = size;
 //   /* customize io's functions (setup()/put()/teardown()) if needed. */
 //
 //   VP8Decoder* dec = VP8New();
 //   bool ok = VP8Decode(dec, &io);
 //   if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
 //   VP8Delete(dec);
 //   return ok;
 // Input / Output
 typedef struct VP8Io VP8Io;
 // Hook invoked when fresh samples are available; returns false on error or
 // abort request.
 typedef int (*VP8IoPutHook)(const VP8Io* io);
 // Hook invoked just before block decoding starts; returns false on setup
 // error.
 typedef int (*VP8IoSetupHook)(VP8Io* io);
 // Hook invoked once block decoding is finished (not called if setup failed).
 typedef void (*VP8IoTeardownHook)(const VP8Io* io);
 // Input/output context exchanged between the decoder and its user: it carries
 // the input bitstream, the decoding options, the user hooks and the sample
 // rows handed to put().
 struct VP8Io {
  // set by VP8GetHeaders()
  int width, height;         // picture dimensions, in pixels (invariable).
                             // These are the original, uncropped dimensions.
                             // The actual area passed to put() is stored
                             // in mb_w / mb_h fields.
  // set before calling put()
  int mb_y;                  // position of the current rows (in pixels)
  int mb_w;                  // number of columns in the sample
  int mb_h;                  // number of rows in the sample
  const uint8_t* y, *u, *v;  // rows to copy (in yuv420 format)
  int y_stride;              // row stride for luma
  int uv_stride;             // row stride for chroma
  void* opaque;              // user data
  // called when fresh samples are available. Currently, samples are in
  // YUV420 format, and can be up to width x 24 in size (depending on the
  // in-loop filtering level, e.g.). Should return false in case of error
  // or abort request. The actual size of the area to update is mb_w x mb_h
  // in size, taking cropping into account.
  VP8IoPutHook put;
  // called just before starting to decode the blocks.
  // Must return false in case of setup error, true otherwise. If false is
  // returned, teardown() will NOT be called. But if the setup succeeded
  // and true is returned, then teardown() will always be called afterward.
  VP8IoSetupHook setup;
  // Called just after block decoding is finished (or when an error occurred
  // during put()). Is NOT called if setup() failed.
  VP8IoTeardownHook teardown;
  // this is a recommendation for the user-side yuv->rgb converter. This flag
  // is set when calling setup() hook and can be overwritten by it. It then
  // can be taken into consideration during the put() method.
  int fancy_upsampling;
  // Input buffer.
  size_t data_size;
  const uint8_t* data;
  // If true, in-loop filtering will not be performed even if present in the
  // bitstream. Switching off filtering may speed up decoding at the expense
  // of more visible blocking. Note that output will also be non-compliant
  // with the VP8 specifications.
  int bypass_filtering;
  // Cropping parameters.
  // NOTE(review): presumably expressed in pixels of the uncropped picture;
  // whether crop_right / crop_bottom are inclusive is not visible here.
  int use_cropping;
  int crop_left, crop_right, crop_top, crop_bottom;
  // Scaling parameters.
  int use_scaling;
  int scaled_width, scaled_height;
  // If non NULL, pointer to the alpha data (if present) corresponding to the
  // start of the current row (That is: it is pre-offset by mb_y and takes
  // cropping into account).
  const uint8_t* a;
 };
 // Internal, version-checked, entry point
 int VP8InitIoInternal(VP8Io* const, int);
 // Main decoding object. This is an opaque structure.
 typedef struct VP8Decoder VP8Decoder;
 // Create a new decoder object.
 VP8Decoder* VP8New(void);
 // Initializes 'io' through the version-checked entry point, passing the ABI
 // version this header was compiled against. Must be called before any other
 // decoding function (VP8Decode, VP8GetHeaders, ...).
 // Returns false in case of version mismatch; no decoding function should be
 // called after such a failure.
 static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
  const int init_ok = VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
  return init_ok;
 }
 // Decode the VP8 frame header. Returns true if ok.
 // Note: 'io->data' must be pointing to the start of the VP8 frame header.
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
 // Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
 // Returns false in case of error.
 int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
 // Return current status of the decoder:
 VP8StatusCode VP8Status(VP8Decoder* const dec);
 // return readable string corresponding to the last status.
 const char* VP8StatusMessage(VP8Decoder* const dec);
 // Resets the decoder in its initial state, reclaiming memory.
 // Not a mandatory call between calls to VP8Decode().
 void VP8Clear(VP8Decoder* const dec);
 // Destroy the decoder object.
 void VP8Delete(VP8Decoder* const dec);
 //------------------------------------------------------------------------------
 // Miscellaneous VP8/VP8L bitstream probing functions.
 // Returns true if the next 3 bytes in data contain the VP8 signature.
 WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
 // Validates the VP8 data-header and retrieves basic header information viz
 // width and height. Returns 0 in case of formatting error. *width/*height
 // can be passed NULL.
 WEBP_EXTERN(int) VP8GetInfo(
    const uint8_t* data,
    size_t data_size,    // data available so far
    size_t chunk_size,   // total data size expected in the chunk
    int* const width, int* const height);
 // Returns true if the next byte(s) in data is a VP8L signature.
 WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
 // Validates the VP8L data-header and retrieves basic header information viz
 // width, height and alpha. Returns 0 in case of formatting error.
 // width/height/has_alpha can be passed NULL.
 WEBP_EXTERN(int) VP8LGetInfo(
    const uint8_t* data, size_t data_size,  // data available so far
    int* const width, int* const height, int* const has_alpha);
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_WEBP_DECODE_VP8_H_ */
--- a/src/loaders/webp/dec/frame.cpp
+++ b/src/loaders/webp/dec/frame.cpp
@ -0,0 +1,713 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Frame-reconstruction function. Memory allocation.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <stdlib.h>
 #include "./vp8i.h"
 #include "../utils/utils.h"
 // Alignment mask for 32-byte boundaries (the alignment minus one).
 #define ALIGN_CST (32 - 1)
 // Rounds the address PTR up to the next multiple of (ALIGN_CST + 1). The
 // result is a uintptr_t, so callers cast it back to a pointer type.
 #define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
 //------------------------------------------------------------------------------
 // Main reconstruction function.
 // Offsets (x + y * BPS) of the sixteen 4x4 luma sub-blocks inside a
 // macroblock, listed row by row. ReconstructRow() adds them to the luma
 // work-buffer pointer to locate each sub-block.
 static const int kScan[16] = {
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
 };
 // Picks the DC-prediction variant matching the macroblock position: blocks in
 // the first column and/or first row get the variants that do without the
 // left and/or top neighbours. Any other mode is returned unchanged.
 static int CheckMode(int mb_x, int mb_y, int mode) {
  const int on_left_edge = (mb_x == 0);
  const int on_top_edge = (mb_y == 0);
  if (mode != B_DC_PRED) return mode;
  if (on_left_edge && on_top_edge) return B_DC_PRED_NOTOPLEFT;
  if (on_left_edge) return B_DC_PRED_NOLEFT;
  if (on_top_edge) return B_DC_PRED_NOTOP;
  return B_DC_PRED;
 }
 // Copies exactly four bytes from 'src' to 'dst'. Neither pointer needs to be
 // aligned, since the bytes travel through memcpy().
 static void Copy32b(uint8_t* const dst, const uint8_t* const src) {
  uint32_t word;
  memcpy(&word, src, sizeof(word));
  memcpy(dst, &word, sizeof(word));
 }
 // Adds the residual of one 4x4 luma block to 'dst'. The two topmost bits of
 // 'bits' select the inverse transform: 3 -> full transform, 2 -> AC3
 // variant, 1 -> DC-only, 0 -> nothing to add.
 static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
                                    uint8_t* const dst) {
  const uint32_t kind = bits >> 30;
  if (kind == 3) {
    VP8Transform(src, dst, 0);
  } else if (kind == 2) {
    VP8TransformAC3(src, dst);
  } else if (kind == 1) {
    VP8TransformDC(src, dst);
  }
 }
 // Adds the residual of one chroma plane to 'dst'. Only the low byte of 'bits'
 // is inspected: if it is zero nothing is done, otherwise the 0xaa mask
 // decides between the full U/V transform and the DC-only one.
 static void DoUVTransform(uint32_t bits, const int16_t* const src,
                          uint8_t* const dst) {
  if ((bits & 0xff) == 0) return;   // no non-zero coeff at all
  if (bits & 0xaa) {
    // Some AC coefficient is non-zero. The AC3 variant is not used for U/V.
    VP8TransformUV(src, dst);
    return;
  }
  VP8TransformDCUV(src, dst);
 }
 // Reconstructs one complete row of macroblocks. For every macroblock it runs
 // intra prediction in the dec->yuv_b_ work buffer, adds the residuals, saves
 // the bottom samples for the next row and copies the pixels to the row cache.
 //   dec: decoder state. The pointer is const, but the buffers it references
 //        (yuv_b_, yuv_t_, cache_y_/cache_u_/cache_v_) are written here.
 //   ctx: supplies the row index (mb_y_), the cache slot (id_) and the parsed
 //        per-macroblock data (mb_data_).
 static void ReconstructRow(const VP8Decoder* const dec,
                           const VP8ThreadContext* ctx) {
  int j;
  int mb_x;
  const int mb_y = ctx->mb_y_;
  const int cache_id = ctx->id_;
  // Work areas for the current macroblock inside yuv_b_; each has stride BPS.
  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
  // Initialize left-most block.
  for (j = 0; j < 16; ++j) {
    y_dst[j * BPS - 1] = 129;
  }
  for (j = 0; j < 8; ++j) {
    u_dst[j * BPS - 1] = 129;
    v_dst[j * BPS - 1] = 129;
  }
  // Init top-left sample on left column too.
  if (mb_y > 0) {
    y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
  } else {
    // we only need to do this init once at block (0,0).
    // Afterward, it remains valid for the whole topmost row.
    // Luma: 1 top-left + 16 top + 4 top-right samples; chroma: 1 + 8.
    memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
    memset(u_dst - BPS - 1, 127, 8 + 1);
    memset(v_dst - BPS - 1, 127, 8 + 1);
  }
  // Reconstruct one row.
  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
    const VP8MBData* const block = ctx->mb_data_ + mb_x;
    // Rotate in the left samples from previously decoded block. We move four
    // pixels at a time for alignment reason, and because of in-loop filter.
    // The loops start at j = -1 so the row above the block is rotated as well.
    if (mb_x > 0) {
      for (j = -1; j < 16; ++j) {
        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
      }
      for (j = -1; j < 8; ++j) {
        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
      }
    }
    {
      // bring top samples into the cache
      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
      const int16_t* const coeffs = block->coeffs_;
      uint32_t bits = block->non_zero_y_;
      int n;
      if (mb_y > 0) {
        memcpy(y_dst - BPS, top_yuv[0].y, 16);
        memcpy(u_dst - BPS, top_yuv[0].u, 8);
        memcpy(v_dst - BPS, top_yuv[0].v, 8);
      }
      // predict and add residuals
      if (block->is_i4x4_) {   // 4x4
        // The four samples to the right of the 16 top luma samples.
        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
        if (mb_y > 0) {
          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
            // No macroblock to the right: repeat the last top sample.
            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
          } else {
            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
          }
        }
        // replicate the top-right pixels below
        // (top_right is a uint32_t pointer, so index BPS is 4 * BPS bytes,
        // i.e. four pixel rows further down.)
        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
        // predict and add residuals for all 4x4 blocks in turn.
        // 'bits' is shifted by two per block so DoTransform() always reads the
        // current block's flags from the two topmost bits.
        for (n = 0; n < 16; ++n, bits <<= 2) {
          uint8_t* const dst = y_dst + kScan[n];
          VP8PredLuma4[block->imodes_[n]](dst);
          DoTransform(bits, coeffs + n * 16, dst);
        }
      } else {    // 16x16
        const int pred_func = CheckMode(mb_x, mb_y, block->imodes_[0]);
        VP8PredLuma16[pred_func](y_dst);
        if (bits != 0) {
          for (n = 0; n < 16; ++n, bits <<= 2) {
            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
          }
        }
      }
      {
        // Chroma
        // U coefficients start at block 16 and V at block 20; the low byte of
        // non_zero_uv_ describes U and the next byte describes V.
        const uint32_t bits_uv = block->non_zero_uv_;
        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
        VP8PredChroma8[pred_func](u_dst);
        VP8PredChroma8[pred_func](v_dst);
        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
      }
      // stash away top samples for next block
      // (skipped on the last macroblock row, where no row follows)
      if (mb_y < dec->mb_h_ - 1) {
        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
      }
    }
    // Transfer reconstructed samples from yuv_b_ cache to final destination.
    {
      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
      for (j = 0; j < 16; ++j) {
        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
      }
      for (j = 0; j < 8; ++j) {
        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
      }
    }
  }
 }
 //------------------------------------------------------------------------------
 // Filtering
 // kFilterExtraRows[] = How many extra lines are needed on the MB boundary
 // for caching, given a filtering level.
 // The table is indexed by dec->filter_type_: 0 = no filtering, 1 = simple,
 // 2 = complex.
 // Simple filter:  up to 2 luma samples are read and 1 is written.
 // Complex filter: up to 4 luma samples are read and 3 are written. Same for
 //                 U/V, so it's 8 samples total (because of the 2x upsampling).
 static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
 static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int cache_id = ctx->id_;
  const int y_bps = dec->cache_y_stride_;
  const VP8FInfo* const f_info = ctx->f_info_ + mb_x;
  uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
  const int ilevel = f_info->f_ilevel_;
  const int limit = f_info->f_limit_;
  if (limit == 0) {
    return;
  }
  assert(limit >= 3);
  if (dec->filter_type_ == 1) {   // simple
    if (mb_x > 0) {
      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
    }
    if (f_info->f_inner_) {
      VP8SimpleHFilter16i(y_dst, y_bps, limit);
    }
    if (mb_y > 0) {
      VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
    }
    if (f_info->f_inner_) {
      VP8SimpleVFilter16i(y_dst, y_bps, limit);
    }
  } else {    // complex
    const int uv_bps = dec->cache_uv_stride_;
    uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
    uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
    const int hev_thresh = f_info->hev_thresh_;
    if (mb_x > 0) {
      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    }
    if (f_info->f_inner_) {
      VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
      VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    }
    if (mb_y > 0) {
      VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
      VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
    }
    if (f_info->f_inner_) {
      VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
      VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
    }
  }
 }
 // Filter the decoded macroblock row (if needed)
 // Applies DoFilter() to every macroblock of the current row whose column
 // lies in [tl_mb_x_, br_mb_x_). Must only be called for rows flagged with
 // filter_row_.
 static void FilterRow(const VP8Decoder* const dec) {
  const int row = dec->thread_ctx_.mb_y_;
  const int end_col = dec->br_mb_x_;
  int col = dec->tl_mb_x_;
  assert(dec->thread_ctx_.filter_row_);
  while (col < end_col) {
    DoFilter(dec, col, row);
    ++col;
  }
 }
 //------------------------------------------------------------------------------
 // Precompute the filtering strength for each segment and each i4x4/i16x16 mode.
 // Fills dec->fstrengths_[segment][i4x4] with the loop-filter parameters of
 // every segment, once for 16x16 macroblocks (index 0) and once for 4x4 ones
 // (index 1). Does nothing when filtering is off (filter_type_ == 0).
 static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
  const VP8FilterHeader* const hdr = &dec->filter_hdr_;
  int seg;
  if (dec->filter_type_ <= 0) return;
  for (seg = 0; seg < NUM_MB_SEGMENTS; ++seg) {
    int is_i4x4;
    // Start from the frame-wide level. A segment may replace it (absolute
    // mode) or add its own strength to it (delta mode).
    int base_level = hdr->level_;
    if (dec->segment_hdr_.use_segment_) {
      base_level = dec->segment_hdr_.filter_strength_[seg];
      if (!dec->segment_hdr_.absolute_delta_) {
        base_level += hdr->level_;
      }
    }
    for (is_i4x4 = 0; is_i4x4 <= 1; ++is_i4x4) {
      VP8FInfo* const info = &dec->fstrengths_[seg][is_i4x4];
      int level = base_level;
      if (hdr->use_lf_delta_) {
        level += hdr->ref_lf_delta_[0];
        if (is_i4x4) {
          level += hdr->mode_lf_delta_[0];
        }
      }
      // Clamp to [0, 63].
      if (level < 0) {
        level = 0;
      } else if (level > 63) {
        level = 63;
      }
      info->f_inner_ = is_i4x4;
      if (level == 0) {
        info->f_limit_ = 0;  // no filtering
        continue;
      }
      {
        // The interior level is the level reduced according to sharpness,
        // and is never allowed to drop below 1.
        int ilevel = level;
        if (hdr->sharpness_ > 0) {
          ilevel >>= (hdr->sharpness_ > 4) ? 2 : 1;
          if (ilevel > 9 - hdr->sharpness_) {
            ilevel = 9 - hdr->sharpness_;
          }
        }
        if (ilevel < 1) ilevel = 1;
        info->f_ilevel_ = ilevel;
        info->f_limit_ = 2 * level + ilevel;
        info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
      }
    }
  }
 }
 //------------------------------------------------------------------------------
 // Dithering
 // Number of entries in kQuantToDitherAmp[]. VP8InitDithering() leaves the
 // dither amplitude untouched for segments whose uv_quant_ is at or above it.
 #define DITHER_AMP_TAB_SIZE 12
 // Dithering amplitude factor, indexed by the segment's uv_quant_ value
 // (negative values use entry 0). Larger indices get weaker dithering.
 static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
  // roughly, it's dqm->uv_mat_[1]
  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
 };
 // Configures chroma and alpha dithering from the user options.
 //   options: may be NULL, in which case nothing is changed.
 //   dec:     must not be NULL; its per-segment dither_ amplitudes, dither_
 //            flag, random generator and alpha_dithering_ are updated.
 void VP8InitDithering(const WebPDecoderOptions* const options,
                      VP8Decoder* const dec) {
  assert(dec != NULL);
  if (options == NULL) return;
  {
    const int strength = options->dithering_strength;
    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
    int f;
    // Map the strength onto [0, max_amp], treating it as a 0..100 percentage.
    if (strength < 0) {
      f = 0;
    } else if (strength > 100) {
      f = max_amp;
    } else {
      f = strength * max_amp / 100;
    }
    if (f > 0) {
      int seg;
      int any_amp = 0;
      for (seg = 0; seg < NUM_MB_SEGMENTS; ++seg) {
        VP8QuantMatrix* const dqm = &dec->dqm_[seg];
        if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
          // TODO(skal): should we specially dither more for uv_quant_ < 0?
          const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
          dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
        }
        any_amp |= dqm->dither_;
      }
      // Only enable dithering if at least one segment has a non-zero amplitude.
      if (any_amp != 0) {
        VP8InitRandom(&dec->dithering_rg_, 1.0f);
        dec->dither_ = 1;
      }
    }
  }
  // potentially allow alpha dithering (strength limited to [0, 100])
  dec->alpha_dithering_ = options->alpha_dithering_strength;
  if (dec->alpha_dithering_ > 100) {
    dec->alpha_dithering_ = 100;
  } else if (dec->alpha_dithering_ < 0) {
    dec->alpha_dithering_ = 0;
  }
 }
 // minimal amp that will provide a non-zero dithering effect
 // (DitherRow() skips macroblocks whose dither_ value is below it)
 #define MIN_DITHER_AMP 4
 // Right shift applied to the random value in Dither8x8(), and the rounding
 // constant (half of 1 << DITHER_DESCALE) added before that shift.
 #define DITHER_DESCALE 4
 #define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
 // Dither8x8() requests DITHER_AMP_BITS + 1 bits from VP8RandomBits2() and
 // subtracts DITHER_AMP_CENTER from the returned value.
 #define DITHER_AMP_BITS 8
 #define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
 // Adds pseudo-random noise of amplitude 'amp' to an 8x8 block of samples.
 //   rg:  random generator, advanced once per sample.
 //   dst: top-left sample of the block; rows are 'bps' bytes apart.
 // Every result is clipped to [0, 255].
 static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
  int row;
  for (row = 0; row < 8; ++row, dst += bps) {
    int col;
    for (col = 0; col < 8; ++col) {
      // TODO: could be made faster with SSE2
      const int noise =
          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
      const int delta = (noise + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
      int v = (int)dst[col] + delta;
      if (v < 0) {
        v = 0;
      } else if (v > 255) {
        v = 255;
      }
      dst[col] = (uint8_t)v;
    }
  }
 }
 // Dithers the U and V planes of every macroblock in [tl_mb_x_, br_mb_x_) of
 // the cached row, skipping those whose amplitude is below MIN_DITHER_AMP.
 static void DitherRow(VP8Decoder* const dec) {
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int uv_bps = dec->cache_uv_stride_;
  const int cache_id = ctx->id_;
  int mb_x;
  assert(dec->dither_);
  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
    const VP8MBData* const data = ctx->mb_data_ + mb_x;
    uint8_t* u_dst;
    uint8_t* v_dst;
    if (data->dither_ < MIN_DITHER_AMP) continue;
    u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
    v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
    // U is processed before V so the random sequence is consumed in order.
    Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);
    Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);
  }
 }
 //------------------------------------------------------------------------------
 // This function is called after a row of macroblocks is finished decoding.
 // It also takes into account the following restrictions:
 //  * In case of in-loop filtering, we must hold off sending some of the bottom
 //    pixels as they are yet unfiltered. They will be when the next macroblock
 //    row is decoded. Meanwhile, we must preserve them by rotating them in the
 //    cache area. This doesn't hold for the very bottom row of the uncropped
 //    picture of course.
 //  * we must clip the remaining pixels against the cropping area. The VP8Io
 //    struct must have its y/u/v/a pointers and its mb_y/mb_w/mb_h fields set
 //    correctly before calling put().
 #define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB
 // Finalize and transmit a complete row. Return false in case of user-abort.
 // Post-processes the macroblock row that was just reconstructed: optional
 // in-loop filtering and dithering, then delivery of the finished pixel rows
 // to io->put(), and finally preservation of the bottom rows the next call
 // still needs.
 // Returns the value returned by io->put() (1 if put() was not called), or
 // the result of VP8SetError() when the alpha rows cannot be decoded.
 static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
  int ok = 1;
  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int cache_id = ctx->id_;
  const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
  // Size in bytes of the extra rows kept above the cache for each plane.
  const int ysize = extra_y_rows * dec->cache_y_stride_;
  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
  const int y_offset = cache_id * 16 * dec->cache_y_stride_;
  const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
  // Start of the extra rows located just above the current cache slot.
  uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
  uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
  uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
  const int mb_y = ctx->mb_y_;
  const int is_first_row = (mb_y == 0);
  const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
  if (ctx->filter_row_) {
    FilterRow(dec);
  }
  if (dec->dither_) {
    DitherRow(dec);
  }
  if (io->put != NULL) {
    int y_start = MACROBLOCK_VPOS(mb_y);
    int y_end = MACROBLOCK_VPOS(mb_y + 1);
    if (!is_first_row) {
      // Also emit the rows held back from the previous macroblock row.
      y_start -= extra_y_rows;
      io->y = ydst;
      io->u = udst;
      io->v = vdst;
    } else {
      io->y = dec->cache_y_ + y_offset;
      io->u = dec->cache_u_ + uv_offset;
      io->v = dec->cache_v_ + uv_offset;
    }
    if (!is_last_row) {
      // Hold back the bottom rows until the next row has been filtered.
      y_end -= extra_y_rows;
    }
    if (y_end > io->crop_bottom) {
      y_end = io->crop_bottom;    // make sure we don't overflow on last row.
    }
    io->a = NULL;
    if (dec->alpha_data_ != NULL && y_start < y_end) {
      // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
      // good idea.
      io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
      if (io->a == NULL) {
        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                           "Could not decode alpha data.");
      }
    }
    if (y_start < io->crop_top) {
      // Skip the rows above the cropping window in every plane.
      const int delta_y = io->crop_top - y_start;
      y_start = io->crop_top;
      assert(!(delta_y & 1));
      io->y += dec->cache_y_stride_ * delta_y;
      io->u += dec->cache_uv_stride_ * (delta_y >> 1);
      io->v += dec->cache_uv_stride_ * (delta_y >> 1);
      if (io->a != NULL) {
        io->a += io->width * delta_y;
      }
    }
    if (y_start < y_end) {
      // Skip the columns left of the cropping window, then hand over the rows.
      io->y += io->crop_left;
      io->u += io->crop_left >> 1;
      io->v += io->crop_left >> 1;
      if (io->a != NULL) {
        io->a += io->crop_left;
      }
      io->mb_y = y_start - io->crop_top;
      io->mb_w = io->crop_right - io->crop_left;
      io->mb_h = y_end - y_start;
      ok = io->put(io);
    }
  }
  // rotate top samples if needed
  // (the bottom extra rows of this slot become the rows above the cache)
  if (cache_id + 1 == dec->num_caches_) {
    if (!is_last_row) {
      memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
      memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
      memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
    }
  }
  return ok;
 }
 #undef MACROBLOCK_VPOS
 //------------------------------------------------------------------------------
 // Reconstructs the macroblock row dec->mb_y_ and pushes it through
 // FinishRow(). The row is flagged for in-loop filtering only when filtering
 // is enabled and the row lies within [tl_mb_y_, br_mb_y_].
 // Returns FinishRow()'s result.
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
  VP8ThreadContext* const ctx = &dec->thread_ctx_;
  const int filtering_on = (dec->filter_type_ > 0);
  const int inside_filter_span =
      (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
  // ctx->id_ and ctx->f_info_ are already set
  ctx->mb_y_ = dec->mb_y_;
  ctx->filter_row_ = filtering_on && inside_filter_span;
  ReconstructRow(dec, ctx);
  return FinishRow(dec, io);
 }
 //------------------------------------------------------------------------------
 // Finish setting up the decoding parameter once user's setup() is called.
 // Completes the decoding setup after the user's setup() hook has run:
 // honours bypass_filtering, computes the macroblock rectangle
 // [tl_mb_x_, br_mb_x_) x [tl_mb_y_, br_mb_y_) used for filtering, and
 // precomputes the filter strengths.
 // Returns VP8_STATUS_OK, or the decoder status if setup() reports failure.
 VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
  int margin;
  // Call setup() first. This may trigger additional decoding features on 'io'.
  // Note: Afterward, we must call teardown() no matter what.
  if (io->setup != NULL && !io->setup(io)) {
    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
    return dec->status_;
  }
  // Disable filtering per user request
  if (io->bypass_filtering) {
    dec->filter_type_ = 0;
  }
  // TODO(skal): filter type / strength / sharpness forcing
  // Number of pixels around the crop window that filtering may depend on.
  margin = kFilterExtraRows[dec->filter_type_];
  if (dec->filter_type_ == 2) {
    // The complex filter creates a dependency chain reaching back to the
    // top-left macroblock, so all the previous macroblocks must be filtered.
    // TODO(skal): add an 'approximate_decoding' option, that won't produce
    // a 1:1 bit-exactness for complex filtering?
    dec->tl_mb_y_ = 0;
    dec->tl_mb_x_ = 0;
  } else {
    // Otherwise filtering can start 'margin' pixels before the crop window,
    // since filtering the previous macroblock can modify abutting pixels.
    dec->tl_mb_y_ = (io->crop_top - margin) >> 4;
    if (dec->tl_mb_y_ < 0) dec->tl_mb_y_ = 0;
    dec->tl_mb_x_ = (io->crop_left - margin) >> 4;
    if (dec->tl_mb_x_ < 0) dec->tl_mb_x_ = 0;
  }
  // We need some 'extra' pixels on the right/bottom, limited to the picture.
  dec->br_mb_x_ = (io->crop_right + 15 + margin) >> 4;
  if (dec->br_mb_x_ > dec->mb_w_) {
    dec->br_mb_x_ = dec->mb_w_;
  }
  dec->br_mb_y_ = (io->crop_bottom + 15 + margin) >> 4;
  if (dec->br_mb_y_ > dec->mb_h_) {
    dec->br_mb_y_ = dec->mb_h_;
  }
  PrecomputeFilterStrengths(dec);
  return VP8_STATUS_OK;
 }
 // Invokes the user's teardown() hook, if one is installed. 'dec' is not
 // used. Always returns 1.
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
  if (io->teardown != NULL) io->teardown(io);
  return 1;
 }
 // Sets up the (single-threaded) worker state: one cache row, starting at
 // cache slot 0. Always returns 1.
 static int InitThreadContext(VP8Decoder* const dec) {
  enum { kSingleThreadCacheLines = 1 };  // 1 cache row only
  dec->num_caches_ = kSingleThreadCacheLines;
  dec->cache_id_ = 0;
  return 1;
 }
 //------------------------------------------------------------------------------
 // Memory setup
 // Sizes the per-frame working memory and carves it out of the single heap
 // block owned by dec->mem_, which is only reallocated when the current block
 // is too small. The block is laid out in this order: intra modes (intra_t_),
 // top samples (yuv_t_), macroblock info (mb_info_), filter info (f_info_),
 // then, after alignment, the yuv_b_ work buffer, the macroblock data
 // (mb_data_), the row cache and the optional alpha plane.
 // Returns 1 on success. On failure (size overflow or failed allocation) the
 // error is recorded with VP8SetError() and its result is returned.
 static int AllocateMemory(VP8Decoder* const dec) {
  const int num_caches = dec->num_caches_;
  const int mb_w = dec->mb_w_;
  // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
  const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
  const size_t top_size = sizeof(VP8TopSamples) * mb_w;
  const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
  const size_t f_info_size = (dec->filter_type_ > 0) ? mb_w * sizeof(VP8FInfo) : 0;
  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
  const size_t mb_data_size = mb_w * sizeof(*dec->mb_data_);
  const size_t cache_height = (16 * num_caches
                            + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
  const size_t cache_size = top_size * cache_height;
  // alpha_size is the only one that scales as width x height.
  const uint64_t alpha_size = (dec->alpha_data_ != NULL) ?
      (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
  const uint64_t needed = (uint64_t)intra_pred_mode_size
                        + top_size + mb_info_size + f_info_size
                        + yuv_size + mb_data_size
                        + cache_size + alpha_size + ALIGN_CST;
  uint8_t* mem;
  if (needed != (size_t)needed) {
    // The total does not fit in a size_t. Record the failure, as the
    // allocation-failure path below does, instead of returning silently.
    return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
                       "no memory during frame initialization.");
  }
  if (needed > dec->mem_size_) {
    free(dec->mem_);
    dec->mem_size_ = 0;
    // The down-cast is safe thanks to the overflow check above.
    dec->mem_ = malloc((size_t)needed);
    if (dec->mem_ == NULL) {
      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
                         "no memory during frame initialization.");
    }
    dec->mem_size_ = (size_t)needed;
  }
  mem = (uint8_t*)dec->mem_;
  dec->intra_t_ = (uint8_t*)mem;
  mem += intra_pred_mode_size;
  dec->yuv_t_ = (VP8TopSamples*)mem;
  mem += top_size;
  // mb_info_ points one entry past the start of its area, so that
  // mb_info_[-1] is a valid entry (it is cleared by the memset below).
  dec->mb_info_ = ((VP8MB*)mem) + 1;
  mem += mb_info_size;
  dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL;
  mem += f_info_size;
  dec->thread_ctx_.id_ = 0;
  dec->thread_ctx_.f_info_ = dec->f_info_;
  // Align the work buffer; 'needed' includes ALIGN_CST bytes of slack for it.
  mem = (uint8_t*)DO_ALIGN(mem);
  assert((yuv_size & ALIGN_CST) == 0);
  dec->yuv_b_ = (uint8_t*)mem;
  mem += yuv_size;
  dec->mb_data_ = (VP8MBData*)mem;
  dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
  mem += mb_data_size;
  dec->cache_y_stride_ = 16 * mb_w;
  dec->cache_uv_stride_ = 8 * mb_w;
  {
    // Each plane's cache pointer is placed after the extra rows reserved for
    // the in-loop filter, so those rows sit just above the pointer.
    const int extra_rows = kFilterExtraRows[dec->filter_type_];
    const int extra_y = extra_rows * dec->cache_y_stride_;
    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
    dec->cache_u_ = dec->cache_y_
                  + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
    dec->cache_v_ = dec->cache_u_
                  + 8 * num_caches * dec->cache_uv_stride_ + extra_uv;
    dec->cache_id_ = 0;
  }
  mem += cache_size;
  // alpha plane
  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
  mem += alpha_size;
  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
  // note: left/top-info is initialized once for all.
  memset(dec->mb_info_ - 1, 0, mb_info_size);
  VP8InitScanline(dec);   // initialize left too.
  // initialize top
  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
  return 1;
 }
 // Points the 'io' structure at the decoder's row cache: strides and Y/U/V
 // plane pointers are taken from the cache, the alpha pointer is cleared and
 // the row position is reset to 0.
 static void InitIo(VP8Decoder* const dec, VP8Io* io) {
  io->y_stride = dec->cache_y_stride_;
  io->uv_stride = dec->cache_uv_stride_;
  io->y = dec->cache_y_;
  io->u = dec->cache_u_;
  io->v = dec->cache_v_;
  io->a = NULL;
  io->mb_y = 0;
 }
 // Prepares the decoder for a new frame: worker state, working memory, the
 // 'io' structure and the DSP function tables, in that order.
 // Returns 1 on success, 0 if either of the first two steps fails.
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
  // InitThreadContext() must come first: it sets dec->num_caches_, which
  // AllocateMemory() reads.
  const int ready = InitThreadContext(dec) && AllocateMemory(dec);
  if (!ready) return 0;
  InitIo(dec, io);
  VP8DspInit();  // Init critical function pointers and look-up tables.
  return 1;
 }
 //------------------------------------------------------------------------------
--- a/src/loaders/webp/dec/io.cpp
+++ b/src/loaders/webp/dec/io.cpp
@ -0,0 +1,392 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // functions for sample output.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <assert.h>
 #include <stdlib.h>
 #include "../dec/vp8i.h"
 #include "./webpi.h"
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"
 #include "../utils/utils.h"
 // Point-sampling U/V sampler.
 static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
  WebPDecBuffer* const output = p->output;
  WebPRGBABuffer* const buf = &output->u.RGBA;
  uint8_t* const dst = buf->rgba + io->mb_y * buf->stride;
  WebPSamplerProcessPlane(io->y, io->y_stride,
                          io->u, io->v, io->uv_stride,
                          dst, buf->stride, io->mb_w, io->mb_h,
                          WebPSamplers[output->colorspace]);
  return io->mb_h;
 }
 //------------------------------------------------------------------------------
 // Fancy upsampling
 #ifdef FANCY_UPSAMPLING
 // Converts the rows delivered in 'io' to the output colorspace using the
 // line-pair upsampler selected by the colorspace. The upsampler needs the
 // chroma row below to finish a luma row, so the last row of each call is
 // saved in p->tmp_y/tmp_u/tmp_v and completed by the next call.
 // Returns the number of output rows actually completed by this call.
 static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
  int num_lines_out = io->mb_h;   // a priori guess
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
  WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
  const uint8_t* cur_y = io->y;
  const uint8_t* cur_u = io->u;
  const uint8_t* cur_v = io->v;
  // Chroma rows saved by the previous call.
  const uint8_t* top_u = p->tmp_u;
  const uint8_t* top_v = p->tmp_v;
  int y = io->mb_y;
  const int y_end = io->mb_y + io->mb_h;
  const int mb_w = io->mb_w;
  const int uv_w = (mb_w + 1) / 2;
  if (y == 0) {
    // First line is special cased. We mirror the u/v samples at boundary.
    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, mb_w);
  } else {
    // We can finish the left-over line from previous call.
    upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
             dst - buf->stride, dst, mb_w);
    ++num_lines_out;
  }
  // Loop over each output pairs of row.
  for (; y + 2 < y_end; y += 2) {
    top_u = cur_u;
    top_v = cur_v;
    cur_u += io->uv_stride;
    cur_v += io->uv_stride;
    dst += 2 * buf->stride;
    cur_y += 2 * io->y_stride;
    upsample(cur_y - io->y_stride, cur_y,
             top_u, top_v, cur_u, cur_v,
             dst - buf->stride, dst, mb_w);
  }
  // move to last row
  cur_y += io->y_stride;
  if (io->crop_top + y_end < io->crop_bottom) {
    // Save the unfinished samples for next call (as we're not done yet).
    memcpy(p->tmp_y, cur_y, mb_w * sizeof(*p->tmp_y));
    memcpy(p->tmp_u, cur_u, uv_w * sizeof(*p->tmp_u));
    memcpy(p->tmp_v, cur_v, uv_w * sizeof(*p->tmp_v));
    // The fancy upsampler leaves a row unfinished behind
    // (except for the very last row)
    num_lines_out--;
  } else {
    // Process the very last row of even-sized picture
    if (!(y_end & 1)) {
      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
               dst + buf->stride, NULL, mb_w);
    }
  }
  return num_lines_out;
 }
 #endif    /* FANCY_UPSAMPLING */
 //------------------------------------------------------------------------------
 // Works out which alpha rows can be emitted for the current call.
 //   alpha:    in/out pointer to the first alpha row; moved back by one row
 //             when the previous call left a row pending.
 //   num_rows: receives the number of rows to emit.
 // Returns the index of the first output row to write. Without fancy
 // upsampling this is simply io->mb_y, with io->mb_h rows.
 static int GetAlphaSourceRow(const VP8Io* const io,
                             const uint8_t** alpha, int* const num_rows) {
  int first_row = io->mb_y;
  *num_rows = io->mb_h;
  if (!io->fancy_upsampling) return first_row;
  // Compensate for the 1-line delay of the fancy upscaler.
  // This is similar to EmitFancyRGB().
  if (first_row == 0) {
    // We don't process the last row yet. It'll be done during the next call.
    *num_rows -= 1;
  } else {
    // Fortunately, *alpha data is persistent, so we can go back
    // one row and finish alpha blending, now that the fancy upscaler
    // completed the YUV->RGB interpolation.
    first_row -= 1;
    *alpha -= io->width;
  }
  if (io->crop_top + io->mb_y + io->mb_h == io->crop_bottom) {
    // If it's the very last call, we process all the remaining rows!
    *num_rows = io->crop_bottom - io->crop_top - first_row;
  }
  return first_row;
 }
 // Writes the alpha rows of 'io' into the alpha channel of the RGBA output
 // and, for premultiplied colorspaces, multiplies the colour samples by alpha
 // when the alpha is non-trivial. Does nothing if io->a is NULL.
 // Always returns 0.
 static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
  const uint8_t* alpha = io->a;
  if (alpha == NULL) return 0;
  {
    const WEBP_CSP_MODE colorspace = p->output->colorspace;
    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
    const int width = io->mb_w;
    // The alpha byte is the first of each pixel for ARGB modes, else the 4th.
    const int alpha_first =
        (colorspace == MODE_ARGB || colorspace == MODE_Argb);
    int num_rows;
    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
    uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
    // has_alpha is true if there's non-trivial alpha to premultiply with.
    const int has_alpha = WebPDispatchAlpha(alpha, io->width, width,
                                            num_rows, dst, buf->stride);
    if (has_alpha && WebPIsPremultipliedMode(colorspace)) {
      WebPApplyAlphaMultiply(base_rgba, alpha_first,
                             width, num_rows, buf->stride);
    }
  }
  return 0;
 }
 //------------------------------------------------------------------------------
 // RGBA rescaling
 // Drains every row that both the Y and U rescalers have ready, converts it
 // to the output colorspace and stores it from output row
 // p->last_y + y_pos onwards. Returns the number of rows written.
 static int ExportRGB(WebPDecParams* const p, int y_pos) {
  const WebPYUV444Converter to_rgb =
      WebPYUV444Converters[p->output->colorspace];
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
  uint8_t* row = buf->rgba + (p->last_y + y_pos) * buf->stride;
  int rows_written;
  // For RGB rescaling, because of the YUV420, current scan position
  // U/V can be +1/-1 line from the Y one.  Hence the double test.
  for (rows_written = 0;
       WebPRescalerHasPendingOutput(&p->scaler_y) &&
       WebPRescalerHasPendingOutput(&p->scaler_u);
       ++rows_written, row += buf->stride) {
    assert(p->last_y + y_pos + rows_written < p->output->height);
    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
    WebPRescalerExportRow(&p->scaler_y, 0);
    WebPRescalerExportRow(&p->scaler_u, 0);
    WebPRescalerExportRow(&p->scaler_v, 0);
    to_rgb(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
           row, p->scaler_y.dst_width);
  }
  return rows_written;
 }
 // Feeds the Y, U and V rows of 'io' into their rescalers and exports every
 // output row that becomes available along the way.
 // Returns the total number of output rows written.
 static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
  const int y_rows = io->mb_h;
  const int uv_rows = (y_rows + 1) >> 1;
  int y_done = 0;
  int uv_done = 0;
  int total_out = 0;
  while (y_done < y_rows) {
    const int uv_left = uv_rows - uv_done;
    const int y_in =
        WebPRescalerImport(&p->scaler_y, y_rows - y_done,
                           io->y + y_done * io->y_stride, io->y_stride);
    const int u_in =
        WebPRescalerImport(&p->scaler_u, uv_left,
                           io->u + uv_done * io->uv_stride, io->uv_stride);
    const int v_in =
        WebPRescalerImport(&p->scaler_v, uv_left,
                           io->v + uv_done * io->uv_stride, io->uv_stride);
    (void)v_in;   // remove a gcc warning
    assert(u_in == v_in);
    y_done += y_in;
    uv_done += u_in;
    total_out += ExportRGB(p, total_out);
  }
  return total_out;
 }
 // Drains the pending rows of the alpha rescaler into the alpha channel of the
 // RGBA output buffer, starting at row 'p->last_y + y_pos'. When the output
 // mode is premultiplied and at least one exported alpha value is not 0xff,
 // the rows just written are alpha-multiplied. Returns the number of rows.
 static int ExportAlpha(WebPDecParams* const p, int y_pos) {
  const WEBP_CSP_MODE mode = p->output->colorspace;
  const WebPRGBABuffer* const out = &p->output->u.RGBA;
  const int row_len = p->scaler_a.dst_width;
  const int alpha_first = (mode == MODE_ARGB || mode == MODE_Argb);
  const int premultiplied = WebPIsPremultipliedMode(mode);
  uint8_t* const first_row = out->rgba + (p->last_y + y_pos) * out->stride;
  // Alpha is byte 0 of each 4-byte pixel when it comes first, byte 3 otherwise.
  uint8_t* alpha_dst = first_row + (alpha_first ? 0 : 3);
  uint32_t and_of_alphas = 0xff;   // stays 0xff only if every value is 0xff
  int rows = 0;
  for (; WebPRescalerHasPendingOutput(&p->scaler_a);
       alpha_dst += out->stride, ++rows) {
    int x;
    assert(p->last_y + y_pos + rows < p->output->height);
    WebPRescalerExportRow(&p->scaler_a, 0);
    for (x = 0; x < row_len; ++x) {
      const uint32_t a = p->scaler_a.dst[x];
      alpha_dst[4 * x] = a;
      and_of_alphas &= a;
    }
  }
  if (premultiplied && and_of_alphas != 0xff) {
    WebPApplyAlphaMultiply(first_row, alpha_first,
                           row_len, rows, out->stride);
  }
  return rows;
 }
 static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
  if (io->a != NULL) {
    WebPRescaler* const scaler = &p->scaler_a;
    int j = 0;
    int pos = 0;
    while (j < io->mb_h) {
      j += WebPRescalerImport(scaler, io->mb_h - j,
                              io->a + j * io->width, io->width);
      pos += p->emit_alpha_row(p, pos);
    }
  }
  return 0;
 }
 static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
  const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
  const int out_width  = io->scaled_width;
  const int out_height = io->scaled_height;
  const int uv_in_width  = (io->mb_w + 1) >> 1;
  const int uv_in_height = (io->mb_h + 1) >> 1;
  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
  int32_t* work;  // rescalers work area
  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
  size_t tmp_size1, tmp_size2, total_size;
  tmp_size1 = 3 * work_size;
  tmp_size2 = 3 * out_width;
  if (has_alpha) {
    tmp_size1 += work_size;
    tmp_size2 += out_width;
  }
  total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
  p->memory = calloc(1ULL, total_size);
  if (p->memory == NULL) {
    return 0;   // memory error
  }
  work = (int32_t*)p->memory;
  tmp = (uint8_t*)(work + tmp_size1);
  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
                   tmp + 0 * out_width, out_width, out_height, 0, 1,
                   io->mb_w, out_width, io->mb_h, out_height,
                   work + 0 * work_size);
  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
                   tmp + 1 * out_width, out_width, out_height, 0, 1,
                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                   work + 1 * work_size);
  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
                   tmp + 2 * out_width, out_width, out_height, 0, 1,
                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
                   work + 2 * work_size);
  p->emit = EmitRescaledRGB;
  WebPInitYUV444Converters();
  if (has_alpha) {
    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
                     tmp + 3 * out_width, out_width, out_height, 0, 1,
                     io->mb_w, out_width, io->mb_h, out_height,
                     work + 3 * work_size);
    p->emit_alpha = EmitRescaledAlphaRGB;
    p->emit_alpha_row = ExportAlpha;
    WebPInitAlphaProcessing();
  }
  return 1;
 }
 //------------------------------------------------------------------------------
 // Default custom functions
 // VP8Io 'setup' hook: resets the per-decode state stored in the
 // WebPDecParams attached to 'io->opaque', applies the decoding options and
 // selects the emit callbacks (rescaled, fancy-upsampled or point-sampled RGB,
 // plus the alpha emitter when the output mode has alpha).
 // Returns 1 on success, 0 on failure.
 static int CustomSetup(VP8Io* io) {
  WebPDecParams* const params = (WebPDecParams*)io->opaque;
  const WEBP_CSP_MODE mode = params->output->colorspace;
  const int wants_rgb = WebPIsRGBMode(mode);
  const int wants_alpha = WebPIsAlphaMode(mode);
  // Start from a clean slate: nothing allocated, no callback selected.
  params->emit_alpha_row = NULL;
  params->emit_alpha = NULL;
  params->emit = NULL;
  params->memory = NULL;
  if (!WebPIoInitFromOptions(params->options, io,
                             wants_alpha ? MODE_YUV : MODE_YUVA)) {
    return 0;
  }
  if (wants_alpha && WebPIsPremultipliedMode(mode)) WebPInitUpsamplers();
  if (io->use_scaling) {
    if (!InitRGBRescaler(io, params)) return 0;   // memory error
  } else {
    if (wants_rgb) {
      params->emit = EmitSampledRGB;   // default
      if (!io->fancy_upsampling) {
        WebPInitSamplers();
      } else {
 #ifdef FANCY_UPSAMPLING
        // One scratch block: a luma row followed by a U row and a V row.
        const int chroma_w = (io->mb_w + 1) >> 1;
        params->memory =
            WebPSafeMalloc(1ULL, (size_t)(io->mb_w + 2 * chroma_w));
        if (params->memory == NULL) return 0;   // memory error.
        params->tmp_y = (uint8_t*)params->memory;
        params->tmp_u = params->tmp_y + io->mb_w;
        params->tmp_v = params->tmp_u + chroma_w;
        params->emit = EmitFancyRGB;
        WebPInitUpsamplers();
 #endif
      }
    }
    if (wants_alpha) {  // need transparency output
      params->emit_alpha = EmitAlphaRGB;
      if (wants_rgb) WebPInitAlphaProcessing();
    }
  }
  if (wants_rgb) VP8YUVInit();
  return 1;
 }
 //------------------------------------------------------------------------------
 // VP8Io 'put' hook: emits the rows of the current batch through the callbacks
 // selected by CustomSetup and advances 'p->last_y' by the number of rows the
 // color emitter produced.
 // Returns 1 on success and 0 when the batch is empty or no emitter has been
 // installed.
 static int CustomPut(const VP8Io* io) {
  WebPDecParams* const p = (WebPDecParams*)io->opaque;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
  int num_lines_out;
  assert(!(io->mb_y & 1));
  if (mb_w <= 0 || mb_h <= 0) {
    return 0;
  }
  // CustomSetup resets 'emit' to NULL and only assigns it on the RGB and
  // rescaling paths; fail cleanly instead of calling through a NULL pointer.
  if (p->emit == NULL) {
    return 0;
  }
  num_lines_out = p->emit(io, p);
  if (p->emit_alpha != NULL) {
    p->emit_alpha(io, p);
  }
  p->last_y += num_lines_out;
  return 1;
 }
 //------------------------------------------------------------------------------
 static void CustomTeardown(const VP8Io* io) {
  WebPDecParams* const p = (WebPDecParams*)io->opaque;
  free(p->memory);
  p->memory = NULL;
 }
 //------------------------------------------------------------------------------
 // Main entry point
 // Installs the default setup/put/teardown hooks on 'io' and attaches 'params'
 // as the opaque pointer those hooks read back.
 void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
  io->opaque   = params;
  io->setup    = CustomSetup;
  io->put      = CustomPut;
  io->teardown = CustomTeardown;
 }
 //------------------------------------------------------------------------------
--- a/src/loaders/webp/dec/meson.build
+++ b/src/loaders/webp/dec/meson.build
@ -0,0 +1,22 @@
 # Headers first, then the translation units, of the decoder in this directory.
 source_file = [
   'alphai.h', 'common.h', 'decode_vp8.h', 'vp8i.h', 'vp8li.h', 'webpi.h',
   'alpha.cpp', 'buffer.cpp', 'frame.cpp', 'io.cpp', 'quant.cpp',
   'tree.cpp', 'vp8.cpp', 'vp8l.cpp', 'webp.cpp'
 ]
 # Append these sources and this include directory to the webp_deb list
 # (presumably declared by the including build file; it is only extended here).
 webp_deb += [declare_dependency(
   sources : source_file,
   include_directories : include_directories('.')
 )]
--- a/src/loaders/webp/dec/quant.cpp
+++ b/src/loaders/webp/dec/quant.cpp
@ -0,0 +1,110 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Quantizer initialization
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./vp8i.h"
 // Clamps 'v' to the range [0, M]: negative values become 0, values above M
 // become M.
 static WEBP_INLINE int clip(int v, int M) {
  if (v < 0) return 0;
  if (v > M) return M;
  return v;
 }
 // Paragraph 14.1
 // DC quantizer lookup table with 128 entries. VP8ParseQuant indexes it with a
 // quantizer index clipped to [0, 127] (or [0, 117] for the chroma DC entry)
 // to fill the [0] slot of the y1/y2/uv matrices.
 static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
 };
 // AC quantizer lookup table with 128 entries. VP8ParseQuant indexes it with a
 // quantizer index clipped to [0, 127] to fill the [1] slot of the y1/y2/uv
 // matrices. The largest entries exceed 255, hence the 16-bit element type.
 static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
 };
 //------------------------------------------------------------------------------
 // Paragraph 9.6
 // Reads the quantizer header from the decoder's bit reader (a 7-bit base
 // index followed by five optional 4-bit signed deltas) and fills the
 // dequantization matrix of every segment in 'dec->dqm_'.
 void VP8ParseQuant(VP8Decoder* const dec) {
  VP8BitReader* const reader = &dec->br_;
  const VP8SegmentHeader* const seg_hdr = &dec->segment_hdr_;
  // The reads below must stay in this order: it is the bitstream order.
  const int base_q0 = VP8GetValue(reader, 7);
  const int dqy1_dc = VP8Get(reader) ? VP8GetSignedValue(reader, 4) : 0;
  const int dqy2_dc = VP8Get(reader) ? VP8GetSignedValue(reader, 4) : 0;
  const int dqy2_ac = VP8Get(reader) ? VP8GetSignedValue(reader, 4) : 0;
  const int dquv_dc = VP8Get(reader) ? VP8GetSignedValue(reader, 4) : 0;
  const int dquv_ac = VP8Get(reader) ? VP8GetSignedValue(reader, 4) : 0;
  int seg;
  for (seg = 0; seg < NUM_MB_SEGMENTS; ++seg) {
    VP8QuantMatrix* const mat = &dec->dqm_[seg];
    int q = base_q0;
    if (seg_hdr->use_segment_) {
      // Per-segment quantizer, either absolute or relative to the base index.
      q = seg_hdr->quantizer_[seg];
      if (!seg_hdr->absolute_delta_) q += base_q0;
    } else if (seg > 0) {
      // Without segmentation every segment shares the matrix of segment 0.
      *mat = dec->dqm_[0];
      continue;
    }
    mat->y1_mat_[0] = kDcTable[clip(q + dqy1_dc, 127)];
    mat->y1_mat_[1] = kAcTable[clip(q + 0,       127)];
    mat->y2_mat_[0] = kDcTable[clip(q + dqy2_dc, 127)] * 2;
    // For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
    // '(x*6349) >> 12' would be the smallest usable precision, but 16 is a
    // convenient word size.
    mat->y2_mat_[1] = (kAcTable[clip(q + dqy2_ac, 127)] * 101581) >> 16;
    if (mat->y2_mat_[1] < 8) mat->y2_mat_[1] = 8;
    mat->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
    mat->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
    mat->uv_quant_ = q + dquv_ac;   // for dithering strength evaluation
  }
 }
 //------------------------------------------------------------------------------
--- a/src/loaders/webp/dec/tree.cpp
+++ b/src/loaders/webp/dec/tree.cpp
@ -0,0 +1,525 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Coding trees and probas
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./vp8i.h"
 #include "../utils/bit_reader_inl.h"
 // Selects the table-driven parsing of intra 4x4 modes in ParseIntraMode
 // instead of the hardcoded decision chain.
 #define USE_GENERIC_TREE
 #ifdef USE_GENERIC_TREE
 // Decision tree for intra 4x4 prediction modes, stored as pairs of entries.
 // ParseIntraMode reads entry [bit] first; while the entry 'i' is positive it
 // reads entry [2 * i + bit] next. A non-positive entry ends the walk and its
 // negation is the decoded mode.
 static const int8_t kYModesIntra4[18] = {
  -B_DC_PRED, 1,
    -B_TM_PRED, 2,
      -B_VE_PRED, 3,
        4, 6,
          -B_HE_PRED, 5,
            -B_RD_PRED, -B_VR_PRED,
        -B_LD_PRED, 7,
          -B_VL_PRED, 8,
            -B_HD_PRED, -B_HU_PRED
 };
 #endif
 //------------------------------------------------------------------------------
 // Default probabilities
 // Paragraph 13.5
 // Default coefficient probabilities, indexed as [type][band][context][proba].
 // VP8ParseProba uses an entry whenever the bitstream does not carry an
 // explicit replacement for it.
 static const uint8_t
  CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
    },
    { { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
    },
    { { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
      { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
    },
    { { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
      { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
    },
    { { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
      { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
    },
    { { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
    },
    { { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
      { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
    },
    { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
    }
  },
  { { { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
      { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
      { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
    },
    { { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
      { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
    },
    { { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
      { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
      { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
    },
    { { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
      { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
    },
    { { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
      { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
      { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
    },
    { { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
      { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
    },
    { { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
      { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
    },
    { { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
      { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
      { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
    }
  },
  { { { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
      { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
      { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
    },
    { { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
      { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
      { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
    },
    { { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
      { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
      { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
    },
    { { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
    },
    { { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
      { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
      { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
    },
    { { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
    },
    { { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
      { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
      { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
    },
    { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
    }
  },
  { { { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
      { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
      { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
    },
    { { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
      { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
    },
    { { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
      { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
      { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
    },
    { { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
      { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
    },
    { { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
      { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
    },
    { { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
      { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
    },
    { { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
      { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
      { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
    },
    { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
      { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
    }
  }
 };
 // Paragraph 11.5
 // Probabilities for decoding an intra 4x4 prediction mode. ParseIntraMode
 // picks the row with [mode of the block above][mode of the block to the left]
 // and uses its NUM_BMODES - 1 entries as the branch probabilities of the
 // mode decision tree.
 static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
  { { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
    { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
    { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
    { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
    { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
    { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
    { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
    { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
    { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
    { 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
  { { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
    { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
    { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
    { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
    { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
    { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
    { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
    { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
    { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
    { 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
  { { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
    { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
    { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
    { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
    { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
    { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
    { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
    { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
    { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
    { 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
  { { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
    { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
    { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
    { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
    { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
    { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
    { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
    { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
    { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
    { 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
  { { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
    { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
    { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
    { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
    { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
    { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
    { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
    { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
    { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
    { 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
  { { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
    { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
    { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
    { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
    { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
    { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
    { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
    { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
    { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
    { 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
  { { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
    { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
    { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
    { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
    { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
    { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
    { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
    { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
    { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
    { 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
  { { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
    { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
    { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
    { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
    { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
    { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
    { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
    { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
    { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
    { 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
  { { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
    { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
    { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
    { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
    { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
    { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
    { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
    { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
    { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
    { 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
  { { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
    { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
    { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
    { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
    { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
    { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
    { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
    { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
    { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
    { 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
 };
 // Sets every byte of the segment probabilities to 255.
 // The coefficient bands (proba->bands_[][]) are filled in later.
 void VP8ResetProba(VP8Proba* const proba) {
  const size_t num_bytes = sizeof(proba->segments_);
  memset(proba->segments_, 0xff, num_bytes);
 }
 // Parses the intra prediction information of the macroblock in column 'mb_x'
 // from 'br' and stores it in dec->mb_data_[mb_x]: segment id, skip flag,
 // luma prediction mode(s) and chroma prediction mode. Also refreshes the
 // top (dec->intra_t_) and left (dec->intra_l_) mode contexts used when
 // parsing neighbouring blocks.
 // The order of the VP8GetBit() calls follows the bitstream and must not be
 // changed.
 static void ParseIntraMode(VP8BitReader* const br,
                           VP8Decoder* const dec, int mb_x) {
  // Four top-context modes per macroblock column, four left-context modes.
  uint8_t* const top = dec->intra_t_ + 4 * mb_x;
  uint8_t* const left = dec->intra_l_;
  VP8MBData* const block = dec->mb_data_ + mb_x;
  // Note: we don't save segment map (yet), as we don't expect
  // to decode more than 1 keyframe.
  if (dec->segment_hdr_.update_map_) {
    // Hardcoded tree parsing
    block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0])
                    ? VP8GetBit(br, dec->proba_.segments_[1])
                    : 2 + VP8GetBit(br, dec->proba_.segments_[2]);
  } else {
    block->segment_ = 0;  // default for intra
  }
  if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_);
  block->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
  if (!block->is_i4x4_) {
    // Hardcoded 16x16 intra-mode decision tree.
    const int ymode =
        VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
                           : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
    block->imodes_[0] = ymode;
    // A single 16x16 mode becomes the context of all four neighbours.
    memset(top, ymode, 4 * sizeof(*top));
    memset(left, ymode, 4 * sizeof(*left));
  } else {
    // One mode per 4x4 sub-block, parsed row by row (16 modes in total).
    uint8_t* modes = block->imodes_;
    int y;
    for (y = 0; y < 4; ++y) {
      // 'ymode' starts as the left context and then tracks the mode of the
      // previously parsed sub-block in this row.
      int ymode = left[y];
      int x;
      for (x = 0; x < 4; ++x) {
        const uint8_t* const prob = kBModesProba[top[x]][ymode];
 #ifdef USE_GENERIC_TREE
        // Generic tree-parsing
        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
        while (i > 0) {
          i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
        }
        ymode = -i;
 #else
        // Hardcoded tree parsing
        ymode = !VP8GetBit(br, prob[0]) ? B_DC_PRED :
                  !VP8GetBit(br, prob[1]) ? B_TM_PRED :
                    !VP8GetBit(br, prob[2]) ? B_VE_PRED :
                      !VP8GetBit(br, prob[3]) ?
                        (!VP8GetBit(br, prob[4]) ? B_HE_PRED :
                          (!VP8GetBit(br, prob[5]) ? B_RD_PRED : B_VR_PRED)) :
                        (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
                          (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
 #endif    // USE_GENERIC_TREE
        top[x] = ymode;
      }
      // 'top' now holds the four modes of this row: store them for the block.
      memcpy(modes, top, 4 * sizeof(*top));
      modes += 4;
      left[y] = ymode;
    }
  }
  // Hardcoded UVMode decision tree
  block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
                 : !VP8GetBit(br, 114) ? V_PRED
                 : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
 }
 // Parses the intra modes of every macroblock in the current row.
 // Returns 0 if the decoder's main bit reader has hit end-of-stream, 1
 // otherwise.
 int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) {
  const int num_columns = dec->mb_w_;
  int column = 0;
  while (column < num_columns) {
    ParseIntraMode(br, dec, column);
    ++column;
  }
  return dec->br_.eof_ ? 0 : 1;
 }
 //------------------------------------------------------------------------------
 // Paragraph 13
 // Probabilities used when reading the per-entry "update" flag in
 // VP8ParseProba, indexed as [type][band][context][proba]. When the flag read
 // with one of these probabilities is set, an explicit 8-bit value is read
 // from the bitstream; otherwise the matching CoeffsProba0 default is used.
 static const uint8_t
    CoeffsUpdateProba[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
  { { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 },
      { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 },
      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    }
  },
  { { { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 },
      { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }
    },
    { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    }
  },
  { { { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 },
      { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }
    },
    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    }
  },
  { { { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 },
      { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    },
    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
    }
  }
 };
 // Paragraph 9.9
 // Maps each of the 16 coefficient positions (plus one trailing sentinel) to
 // the index of the band whose probabilities apply to it. VP8ParseProba uses
 // it to fill proba->bands_ptr_[t][0..16].
 static const int kBands[16 + 1] = {
  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
  0  // extra entry as sentinel
 };
 // Reads the coefficient probabilities from 'br' into dec->proba_: every entry
 // is either replaced by an explicit 8-bit value from the bitstream or left at
 // its CoeffsProba0 default. Also builds the per-position band pointers and
 // reads the optional skip probability.
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
  VP8Proba* const proba = &dec->proba_;
  int type;
  for (type = 0; type < NUM_TYPES; ++type) {
    int band, pos;
    for (band = 0; band < NUM_BANDS; ++band) {
      int ctx;
      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
        int idx;
        for (idx = 0; idx < NUM_PROBAS; ++idx) {
          // Start from the default and override it if the update flag is set.
          int value = CoeffsProba0[type][band][ctx][idx];
          if (VP8GetBit(br, CoeffsUpdateProba[type][band][ctx][idx])) {
            value = VP8GetValue(br, 8);
          }
          proba->bands_[type][band].probas_[ctx][idx] = value;
        }
      }
    }
    // Direct position -> band shortcut (16 positions plus the sentinel).
    for (pos = 0; pos < 16 + 1; ++pos) {
      proba->bands_ptr_[type][pos] = &proba->bands_[type][kBands[pos]];
    }
  }
  dec->use_skip_proba_ = VP8Get(br);
  if (dec->use_skip_proba_) {
    dec->skip_p_ = VP8GetValue(br, 8);
  }
 }
--- a/src/loaders/webp/dec/vp8.cpp
+++ b/src/loaders/webp/dec/vp8.cpp
@ -0,0 +1,681 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // main entry for the decoder
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <stdlib.h>
 #include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
 #include "../utils/bit_reader_inl.h"
 #include "../utils/utils.h"
 //------------------------------------------------------------------------------
 // Returns the decoder version packed into one integer:
 // major in bits 16 and up, minor in bits 8..15, revision in bits 0..7.
 int WebPGetDecoderVersion(void) {
  const int major_bits = DEC_MAJ_VERSION << 16;
  const int minor_bits = DEC_MIN_VERSION << 8;
  return major_bits | minor_bits | DEC_REV_VERSION;
 }
 //------------------------------------------------------------------------------
 // VP8Decoder
 // Puts the decoder back into the "no error" state (status and message).
 static void SetOk(VP8Decoder* const dec) {
  dec->error_msg_ = "OK";
  dec->status_ = VP8_STATUS_OK;
 }
 // Zero-initializes '*io' (when non-NULL) after checking that the caller was
 // built against a compatible decoder ABI. Returns 0 on ABI mismatch and 1
 // otherwise.
 int VP8InitIoInternal(VP8Io* const io, int version) {
  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
    return 0;  // mismatch error
  }
  if (io == NULL) {
    return 1;  // nothing to clear
  }
  memset(io, 0, sizeof(*io));
  return 1;
 }
 // Allocates a zero-filled decoder object in the "OK" state with a single
 // partition. Returns NULL if the allocation fails.
 VP8Decoder* VP8New(void) {
  VP8Decoder* const decoder = (VP8Decoder*)calloc(1ULL, sizeof(*decoder));
  if (decoder == NULL) {
    return decoder;
  }
  SetOk(decoder);
  decoder->ready_ = 0;
  decoder->num_parts_ = 1;
  return decoder;
 }
 // Returns the decoder's current status, or VP8_STATUS_INVALID_PARAM when
 // 'dec' is NULL.
 VP8StatusCode VP8Status(VP8Decoder* const dec) {
  return (dec == NULL) ? VP8_STATUS_INVALID_PARAM : dec->status_;
 }
 // Returns the message attached to the decoder's status: "no object" for a
 // NULL decoder, "OK" when no message was recorded.
 const char* VP8StatusMessage(VP8Decoder* const dec) {
  if (dec == NULL) {
    return "no object";
  }
  return (dec->error_msg_ != NULL) ? dec->error_msg_ : "OK";
 }
 // Releases everything owned by the decoder, then the decoder itself.
 // Passing NULL is a no-op.
 void VP8Delete(VP8Decoder* const dec) {
  if (dec == NULL) {
    return;
  }
  VP8Clear(dec);
  free(dec);
 }
 // Records an error on the decoder and marks it as not ready. Only the first
 // error is kept: if the status is already different from VP8_STATUS_OK the
 // call leaves the decoder untouched. Always returns 0, so callers can write
 // 'return VP8SetError(...)'.
 int VP8SetError(VP8Decoder* const dec,
                VP8StatusCode error, const char* const msg) {
  const int already_failed = (dec->status_ != VP8_STATUS_OK);
  if (!already_failed) {
    dec->ready_ = 0;
    dec->error_msg_ = msg;
    dec->status_ = error;
  }
  return 0;
 }
 //------------------------------------------------------------------------------
 // Returns 1 if 'data' holds at least three bytes and they are the VP8
 // start code 0x9d 0x01 0x2a; returns 0 otherwise.
 int VP8CheckSignature(const uint8_t* const data, size_t data_size) {
  if (data_size < 3) return 0;
  if (data[0] != 0x9d) return 0;
  if (data[1] != 0x01) return 0;
  return data[2] == 0x2a;
 }
 // Validates the first VP8_FRAME_HEADER_SIZE bytes of a VP8 chunk and extracts
 // the picture dimensions.
 //   data, data_size: start of the VP8 payload and number of bytes available.
 //   chunk_size: size the enclosing chunk announces; the partition length
 //               stored in the frame tag must be strictly smaller.
 //   width, height: optional outputs (may be NULL), written only on success.
 // Returns 1 if the header describes a visible key frame with a known profile
 // and non-zero dimensions, 0 otherwise.
 int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
               int* const width, int* const height) {
  uint32_t frame_tag;
  int pic_w, pic_h;
  if (data == NULL || data_size < VP8_FRAME_HEADER_SIZE) {
    return 0;   // not enough data
  }
  if (!VP8CheckSignature(data + 3, data_size - 3)) {
    return 0;   // wrong signature
  }
  frame_tag = data[0] | (data[1] << 8) | (data[2] << 16);
  pic_w = ((data[7] << 8) | data[6]) & 0x3fff;
  pic_h = ((data[9] << 8) | data[8]) & 0x3fff;
  if (frame_tag & 1) {
    return 0;   // not a keyframe
  }
  if (((frame_tag >> 1) & 7) > 3) {
    return 0;   // unknown profile
  }
  if (((frame_tag >> 4) & 1) == 0) {
    return 0;   // first frame is invisible!
  }
  if ((frame_tag >> 5) >= chunk_size) {
    return 0;   // partition_length is inconsistent with the chunk size
  }
  if (pic_w == 0 || pic_h == 0) {
    return 0;   // neither the width nor the height may be zero
  }
  if (width != NULL) {
    *width = pic_w;
  }
  if (height != NULL) {
    *height = pic_h;
  }
  return 1;
 }
 //------------------------------------------------------------------------------
 // Header parsing
 // Restores the segment header defaults: segmentation off, no map update,
 // absolute (not delta) values, all quantizers and filter strengths at zero.
 static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
  assert(hdr != NULL);
  memset(hdr->filter_strength_, 0, sizeof(hdr->filter_strength_));
  memset(hdr->quantizer_, 0, sizeof(hdr->quantizer_));
  hdr->absolute_delta_ = 1;
  hdr->update_map_ = 0;
  hdr->use_segment_ = 0;
 }
 // Paragraph 9.3
 // Reads the segmentation part of the frame header from 'br' into 'hdr', and
 // the segment-map tree probabilities into 'proba' when the map is updated.
 // Returns 0 if the bit reader hit its end-of-data condition, 1 otherwise.
 static int ParseSegmentHeader(VP8BitReader* br,
                              VP8SegmentHeader* hdr, VP8Proba* proba) {
  int i;
  assert(br != NULL);
  assert(hdr != NULL);
  hdr->use_segment_ = VP8Get(br);
  if (!hdr->use_segment_) {
    hdr->update_map_ = 0;
    return !br->eof_;
  }
  hdr->update_map_ = VP8Get(br);
  if (VP8Get(br)) {   // per-segment data is updated
    hdr->absolute_delta_ = VP8Get(br);
    for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
      if (VP8Get(br)) {
        hdr->quantizer_[i] = VP8GetSignedValue(br, 7);
      } else {
        hdr->quantizer_[i] = 0;
      }
    }
    for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
      if (VP8Get(br)) {
        hdr->filter_strength_[i] = VP8GetSignedValue(br, 6);
      } else {
        hdr->filter_strength_[i] = 0;
      }
    }
  }
  if (hdr->update_map_) {
    for (i = 0; i < MB_FEATURE_TREE_PROBS; ++i) {
      if (VP8Get(br)) {
        proba->segments_[i] = VP8GetValue(br, 8);
      } else {
        proba->segments_[i] = 255u;
      }
    }
  }
  return !br->eof_;
 }
 // Paragraph 9.5
 // Reads the number of token partitions from the first partition (dec->br_),
 // then positions one bit reader per partition inside buf[0..size).
 // Layout of 'buf': (num_parts - 1) little-endian 24-bit partition sizes,
 // followed by the partitions themselves; the last partition takes whatever
 // remains.
 //
 // This function returns VP8_STATUS_SUSPENDED if we don't have all the
 // necessary data in 'buf'.
 // This case is not necessarily an error (for incremental decoding).
 // Still, no bitreader is ever initialized to make it possible to read
 // unavailable memory.
 // If we don't even have the partitions' sizes, then VP8_STATUS_NOT_ENOUGH_DATA
 // is returned, and this is an unrecoverable error.
 // If the partitions were positioned ok, VP8_STATUS_OK is returned.
 static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
                                     const uint8_t* buf, size_t size) {
  VP8BitReader* const br = &dec->br_;
  const uint8_t* sz = buf;
  const uint8_t* const buf_end = buf + size;
  const uint8_t* part_start;
  size_t size_left = size;
  size_t last_part;
  size_t p;
  dec->num_parts_ = 1 << VP8GetValue(br, 2);
  last_part = (size_t)dec->num_parts_ - 1;
  // Compare byte counts before forming any pointer: 'buf + last_part * 3'
  // must never be computed when it would lie past the end of the buffer.
  if (size < 3 * last_part) {
    // we can't even read the sizes with sz[]! That's a failure.
    return VP8_STATUS_NOT_ENOUGH_DATA;
  }
  part_start = buf + 3 * last_part;
  size_left -= 3 * last_part;
  for (p = 0; p < last_part; ++p) {
    size_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16);
    // Clamp the announced size to what is really available *before* turning
    // it into a pointer; 'part_start + psize' with an unchecked 24-bit size
    // could point outside the buffer (or wrap around) and defeat the clamp.
    if (psize > size_left) psize = size_left;
    VP8InitBitReader(dec->parts_ + p, part_start, part_start + psize);
    part_start += psize;
    size_left -= psize;
    sz += 3;
  }
  VP8InitBitReader(dec->parts_ + last_part, part_start, buf_end);
  return (part_start < buf_end) ? VP8_STATUS_OK :
           VP8_STATUS_SUSPENDED;   // Init is ok, but there's not enough data
 }
 // Paragraph 9.4
 // Reads the loop-filter part of the frame header into dec->filter_hdr_ and
 // derives dec->filter_type_ from it (0 = off, 1 = simple, 2 = complex).
 // Returns 0 if the bit reader hit its end-of-data condition, 1 otherwise.
 static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
  VP8FilterHeader* const filter = &dec->filter_hdr_;
  int idx;
  filter->simple_ = VP8Get(br);
  filter->level_ = VP8GetValue(br, 6);
  filter->sharpness_ = VP8GetValue(br, 3);
  filter->use_lf_delta_ = VP8Get(br);
  // The second bit ("update lf-delta?") is only present when deltas are used.
  if (filter->use_lf_delta_ && VP8Get(br)) {
    for (idx = 0; idx < NUM_REF_LF_DELTAS; ++idx) {
      if (VP8Get(br)) {
        filter->ref_lf_delta_[idx] = VP8GetSignedValue(br, 6);
      }
    }
    for (idx = 0; idx < NUM_MODE_LF_DELTAS; ++idx) {
      if (VP8Get(br)) {
        filter->mode_lf_delta_[idx] = VP8GetSignedValue(br, 6);
      }
    }
  }
  if (filter->level_ == 0) {
    dec->filter_type_ = 0;
  } else if (filter->simple_) {
    dec->filter_type_ = 1;
  } else {
    dec->filter_type_ = 2;
  }
  return !br->eof_;
 }
 // Topmost call
 // Parses all the frame-level headers found in io->data[0..io->data_size):
 // the 3-byte frame tag, the 7-byte picture header (key frames only), then,
 // from partition #0, the segment header, the filter header, the partition
 // layout, the quantizers and the coefficient probabilities.
 // On success, sets dec->ready_ and returns 1. On failure, records the error
 // with VP8SetError() and returns 0. A NULL 'dec' returns 0 without recording
 // anything.
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
  const uint8_t* buf;
  size_t buf_size;
  VP8FrameHeader* frm_hdr;
  VP8PictureHeader* pic_hdr;
  VP8BitReader* br;
  VP8StatusCode status;
  if (dec == NULL) {
    return 0;
  }
  // Start from a clean status so that errors below are actually recorded
  // (VP8SetError() keeps only the first error).
  SetOk(dec);
  if (io == NULL) {
    return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
                       "null VP8Io passed to VP8GetHeaders()");
  }
  buf = io->data;
  buf_size = io->data_size;
  if (buf_size < 4) {
    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                       "Truncated header.");
  }
  // Paragraph 9.1
  {
    // 24-bit little-endian frame tag:
    // bit 0 = inverse key-frame flag, bits 1..3 = profile, bit 4 = show flag,
    // remaining bits = length of partition #0.
    const uint32_t bits = buf[0] | (buf[1] << 8) | (buf[2] << 16);
    frm_hdr = &dec->frm_hdr_;
    frm_hdr->key_frame_ = !(bits & 1);
    frm_hdr->profile_ = (bits >> 1) & 7;
    frm_hdr->show_ = (bits >> 4) & 1;
    frm_hdr->partition_length_ = (bits >> 5);
    if (frm_hdr->profile_ > 3)
      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                         "Incorrect keyframe parameters.");
    if (!frm_hdr->show_)
      return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
                         "Frame not displayable.");
    buf += 3;
    buf_size -= 3;
  }
  pic_hdr = &dec->pic_hdr_;
  if (frm_hdr->key_frame_) {
    // Paragraph 9.2
    // 3 bytes of start code, then two 16-bit little-endian fields holding a
    // 14-bit dimension in the low bits and a 2-bit scale in the high bits.
    if (buf_size < 7) {
      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                         "cannot parse picture header");
    }
    if (!VP8CheckSignature(buf, buf_size)) {
      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                         "Bad code word");
    }
    pic_hdr->width_ = ((buf[4] << 8) | buf[3]) & 0x3fff;
    pic_hdr->xscale_ = buf[4] >> 6;   // ratio: 1, 5/4 5/3 or 2
    pic_hdr->height_ = ((buf[6] << 8) | buf[5]) & 0x3fff;
    pic_hdr->yscale_ = buf[6] >> 6;
    buf += 7;
    buf_size -= 7;
    // Dimensions in macroblock units (16x16 pixels), rounded up.
    dec->mb_w_ = (pic_hdr->width_ + 15) >> 4;
    dec->mb_h_ = (pic_hdr->height_ + 15) >> 4;
    // Setup default output area (can be later modified during io->setup())
    io->width = pic_hdr->width_;
    io->height = pic_hdr->height_;
    io->use_scaling  = 0;
    io->use_cropping = 0;
    io->crop_top  = 0;
    io->crop_left = 0;
    io->crop_right  = io->width;
    io->crop_bottom = io->height;
    io->mb_w = io->width;   // sanity check
    io->mb_h = io->height;  // ditto
    VP8ResetProba(&dec->proba_);
    ResetSegmentHeader(&dec->segment_hdr_);
  }
  // Check if we have all the partition #0 available, and initialize dec->br_
  // to read this partition (and this partition only).
  if (frm_hdr->partition_length_ > buf_size) {
    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                       "bad partition length");
  }
  br = &dec->br_;
  VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_);
  // From here on, 'buf' / 'buf_size' describe what follows partition #0:
  // the token-partition sizes and data handed to ParsePartitions() below.
  buf += frm_hdr->partition_length_;
  buf_size -= frm_hdr->partition_length_;
  if (frm_hdr->key_frame_) {
    pic_hdr->colorspace_ = VP8Get(br);
    pic_hdr->clamp_type_ = VP8Get(br);
  }
  if (!ParseSegmentHeader(br, &dec->segment_hdr_, &dec->proba_)) {
    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                       "cannot parse segment header");
  }
  // Filter specs
  if (!ParseFilterHeader(br, dec)) {
    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
                       "cannot parse filter header");
  }
  // Any status other than OK (including SUSPENDED) is reported as a failure.
  status = ParsePartitions(dec, buf, buf_size);
  if (status != VP8_STATUS_OK) {
    return VP8SetError(dec, status, "cannot parse partitions");
  }
  // quantizer change
  VP8ParseQuant(dec);
  // Frame buffer marking
  if (!frm_hdr->key_frame_) {
    return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
                       "Not a key frame.");
  }
  VP8Get(br);   // ignore the value of update_proba_
  VP8ParseProba(br, dec);
  // sanitized state
  dec->ready_ = 1;
  return 1;
 }
 //------------------------------------------------------------------------------
 // Residual decoding (Paragraph 13.2 / 13.3)
 // Probabilities used by GetLargeValue() to read the extra bits of the four
 // largest value categories. Each list is terminated by a 0 entry, which the
 // reading loop uses as its stop condition.
 static const uint8_t kCat3[] = { 173, 148, 140, 0 };
 static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
 static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
 static const uint8_t kCat6[] =
  { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
 // Indexed by the 2-bit category number computed in GetLargeValue().
 static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
 // Maps the decoding position n of a coefficient (0..15) to the index at which
 // GetCoeffs() stores it in the 4x4 output block.
 static const uint8_t kZigzag[16] = {
  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
 // See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
 // Decodes the magnitude of a coefficient already known to be larger than 1
 // (GetCoeffs() calls this after reading a set bit with probability p[2]).
 // 'p' is the probability array of the current context; entries p[3] to p[10]
 // are used here. The returned value is always >= 2.
 static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
  int v;
  if (!VP8GetBit(br, p[3])) {
    // Small values: 2, or 3..4.
    if (!VP8GetBit(br, p[4])) {
      v = 2;
    } else {
      v = 3 + VP8GetBit(br, p[5]);
    }
  } else {
    if (!VP8GetBit(br, p[6])) {
      // 5..6 (one extra bit) or 7..10 (two extra bits), read with fixed
      // probabilities.
      if (!VP8GetBit(br, p[7])) {
        v = 5 + VP8GetBit(br, 159);
      } else {
        v = 7 + 2 * VP8GetBit(br, 165);
        v += VP8GetBit(br, 145);
      }
    } else {
      // Largest categories: two bits select one of the kCat3456[] lists, whose
      // entries give the probability of each extra bit (most significant bit
      // first).
      const uint8_t* tab;
      const int bit1 = VP8GetBit(br, p[8]);
      const int bit0 = VP8GetBit(br, p[9 + bit1]);
      const int cat = 2 * bit1 + bit0;
      v = 0;
      for (tab = kCat3456[cat]; *tab; ++tab) {
        v += v + VP8GetBit(br, *tab);   // v = 2 * v + next bit
      }
      v += 3 + (8 << cat);   // add the base value of the category
    }
  }
  return v;
 }
 // Returns the position of the last non-zero coeff plus one
 // Decodes the coefficients of one 4x4 block, starting at position 'n'.
 //   br:   token bit reader.
 //   prob: per-position band probabilities (17 entries: the sentinel makes
 //         prob[16] valid, see the prob[++n] / prob[n + 1] accesses below).
 //   ctx:  context selecting the probabilities of the first coefficient.
 //   dq:   dequantization factors; dq[0] is applied at position 0, dq[1] at
 //         every other position.
 //   n:    first position to decode (0 or 1 in the calls made by
 //         ParseResiduals()).
 //   out:  16-entry destination; values are stored at kZigzag[position].
 //         Entries for zero coefficients are not written.
 static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[],
                     int ctx, const quant_t dq, int n, int16_t* out) {
  const uint8_t* p = prob[n]->probas_[ctx];
  for (; n < 16; ++n) {
    // p[0] codes the "more coefficients follow" decision.
    if (!VP8GetBit(br, p[0])) {
      return n;  // previous coeff was last non-zero coeff
    }
    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
      // After a zero, the next position uses context 0.
      p = prob[++n]->probas_[0];
      if (n == 16) return 16;
    }
    {        // non zero coeff
      // Contexts for the next position: 1 after a value of 1, 2 after a
      // larger value.
      const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
      int v;
      if (!VP8GetBit(br, p[2])) {
        v = 1;
        p = p_ctx[1];
      } else {
        v = GetLargeValue(br, p);
        p = p_ctx[2];
      }
      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
    }
  }
  return 16;
 }
 // Appends a 2-bit code describing one 4x4 block to 'nz_coeffs' (shifted left
 // by two to make room). The code is 3 when nz > 3, 2 when nz > 1, and
 // 'dc_nz' otherwise.
 static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
  int code;
  if (nz > 3) {
    code = 3;
  } else if (nz > 1) {
    code = 2;
  } else {
    code = dc_nz;
  }
  return (nz_coeffs << 2) | code;
 }
 // Two-pass 4x4 butterfly transform applied to the 16 values of 'in'.
 // The 16 results are not stored contiguously: result k goes to out[16 * k],
 // i.e. one value per group of 16 coefficients. The second pass adds a
 // rounding term of 3 and shifts right by 3.
 static void TransformWHT(const int16_t* in, int16_t* out) {
  int stage[16];
  int col, row;
  // First pass: combine the four values found in each column.
  for (col = 0; col < 4; ++col) {
    const int sum_outer = in[col] + in[col + 12];
    const int sum_inner = in[col + 4] + in[col + 8];
    const int diff_inner = in[col + 4] - in[col + 8];
    const int diff_outer = in[col] - in[col + 12];
    stage[col] = sum_outer + sum_inner;
    stage[col + 4] = diff_outer + diff_inner;
    stage[col + 8] = sum_outer - sum_inner;
    stage[col + 12] = diff_outer - diff_inner;
  }
  // Second pass: combine the four values of each row, round and scale.
  for (row = 0; row < 4; ++row) {
    const int* const line = stage + 4 * row;
    int16_t* const dst = out + 64 * row;
    const int dc = line[0] + 3;    // w/ rounder
    const int sum_outer = dc + line[3];
    const int sum_inner = line[1] + line[2];
    const int diff_inner = line[1] - line[2];
    const int diff_outer = dc - line[3];
    dst[0] = (sum_outer + sum_inner) >> 3;
    dst[16] = (diff_outer + diff_inner) >> 3;
    dst[32] = (sum_outer - sum_inner) >> 3;
    dst[48] = (diff_outer - diff_inner) >> 3;
  }
 }
 // Parses all the residual coefficients of the current macroblock (the one at
 // dec->mb_x_) from 'token_br' into block->coeffs_, and updates the non-zero
 // contexts of the top ('mb') and left (dec->mb_info_[-1]) neighbours.
 // Also fills block->non_zero_y_, block->non_zero_uv_ and block->dither_.
 // Returns 1 if no non-zero code was recorded for any block, 0 otherwise.
 static int ParseResiduals(VP8Decoder* const dec,
                          VP8MB* const mb, VP8BitReader* const token_br) {
  const VP8BandProbas* (* const bands)[16 + 1] = dec->proba_.bands_ptr_;
  const VP8BandProbas* const * ac_proba;
  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
  const VP8QuantMatrix* const q = &dec->dqm_[block->segment_];
  int16_t* dst = block->coeffs_;
  VP8MB* const left_mb = dec->mb_info_ - 1;
  uint8_t tnz, lnz;
  uint32_t non_zero_y = 0;
  uint32_t non_zero_uv = 0;
  int x, y, ch;
  uint32_t out_t_nz, out_l_nz;
  int first;
  // Zero coefficients are never written by GetCoeffs(): clear everything.
  memset(dst, 0, 384 * sizeof(*dst));
  if (!block->is_i4x4_) {    // parse DC
    // Not intra4x4: the DC values of the 16 luma blocks are coded as one
    // separate block (type 1) and spread to dst[0], dst[16], dst[32], ...
    int16_t dc[16] = { 0 };
    const int ctx = mb->nz_dc_ + left_mb->nz_dc_;
    const int nz = GetCoeffs(token_br, bands[1], ctx, q->y2_mat_, 0, dc);
    mb->nz_dc_ = left_mb->nz_dc_ = (nz > 0);
    if (nz > 1) {   // more than just the DC -> perform the full transform
      TransformWHT(dc, dst);
    } else {        // only DC is non-zero -> inlined simplified transform
      int i;
      const int dc0 = (dc[0] + 3) >> 3;
      for (i = 0; i < 16 * 16; i += 16) dst[i] = dc0;
    }
    // The luma blocks then start decoding at position 1 (type 0).
    first = 1;
    ac_proba = bands[0];
  } else {
    first = 0;
    ac_proba = bands[3];
  }
  // Luma: 4x4 grid of blocks. The low 4 bits of nz_ hold one context bit per
  // column (top) or per row (left).
  tnz = mb->nz_ & 0x0f;
  lnz = left_mb->nz_ & 0x0f;
  for (y = 0; y < 4; ++y) {
    int l = lnz & 1;
    uint32_t nz_coeffs = 0;
    for (x = 0; x < 4; ++x) {
      const int ctx = l + (tnz & 1);
      const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst);
      l = (nz > first);
      // Consume the context bit of this column and queue the new one.
      tnz = (tnz >> 1) | (l << 7);
      nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
      dst += 16;
    }
    tnz >>= 4;
    lnz = (lnz >> 1) | (l << 7);
    non_zero_y = (non_zero_y << 8) | nz_coeffs;
  }
  out_t_nz = tnz;
  out_l_nz = lnz >> 4;
  // Chroma: two planes (ch = 0 and ch = 2) of 2x2 blocks each, using two
  // context bits per plane taken from the upper 4 bits of nz_.
  for (ch = 0; ch < 4; ch += 2) {
    uint32_t nz_coeffs = 0;
    tnz = mb->nz_ >> (4 + ch);
    lnz = left_mb->nz_ >> (4 + ch);
    for (y = 0; y < 2; ++y) {
      int l = lnz & 1;
      for (x = 0; x < 2; ++x) {
        const int ctx = l + (tnz & 1);
        const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst);
        l = (nz > 0);
        tnz = (tnz >> 1) | (l << 3);
        nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
        dst += 16;
      }
      tnz >>= 2;
      lnz = (lnz >> 1) | (l << 5);
    }
    // Note: we don't really need the per-4x4 details for U/V blocks.
    non_zero_uv |= nz_coeffs << (4 * ch);
    out_t_nz |= (tnz << 4) << ch;
    out_l_nz |= (lnz & 0xf0) << ch;
  }
  // Publish the updated contexts for the macroblocks to the right and below.
  mb->nz_ = out_t_nz;
  left_mb->nz_ = out_l_nz;
  block->non_zero_y_ = non_zero_y;
  block->non_zero_uv_ = non_zero_uv;
  // We look at the mode-code of each block and check if some blocks have less
  // than three non-zero coeffs (code < 2). This is to avoid dithering flat and
  // empty blocks.
  block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
  return !(non_zero_y | non_zero_uv);  // will be used for further optimization
 }
 //------------------------------------------------------------------------------
 // Main loop
 // Decodes the residuals of the macroblock at dec->mb_x_ (or clears its
 // non-zero contexts when it is flagged as skipped), then stores its filter
 // parameters if in-loop filtering is enabled.
 // Returns 0 when 'token_br' reached its end-of-data condition, 1 otherwise.
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
  const int x = dec->mb_x_;
  VP8MB* const left_ctx = dec->mb_info_ - 1;
  VP8MB* const top_ctx = dec->mb_info_ + x;
  VP8MBData* const data = dec->mb_data_ + x;
  int skipped = 0;
  if (dec->use_skip_proba_) {
    skipped = data->skip_;
  }
  if (skipped) {
    // Nothing coded for this macroblock: reset the contexts it leaves behind.
    top_ctx->nz_ = 0;
    left_ctx->nz_ = 0;
    if (!data->is_i4x4_) {
      top_ctx->nz_dc_ = 0;
      left_ctx->nz_dc_ = 0;
    }
    data->non_zero_y_ = 0;
    data->non_zero_uv_ = 0;
    data->dither_ = 0;
  } else {
    // ParseResiduals() reports whether every block turned out to be empty.
    skipped = ParseResiduals(dec, top_ctx, token_br);
  }
  if (dec->filter_type_ > 0) {  // store filter info
    VP8FInfo* const info = dec->f_info_ + x;
    *info = dec->fstrengths_[data->segment_][data->is_i4x4_];
    info->f_inner_ |= !skipped;
  }
  return !token_br->eof_;
 }
 // Prepares the decoder for a new row of macroblocks: rewinds mb_x_ to 0,
 // clears the left non-zero context (stored at dec->mb_info_[-1]) and resets
 // the left intra modes to B_DC_PRED.
 void VP8InitScanline(VP8Decoder* const dec) {
  VP8MB* const left_ctx = dec->mb_info_ - 1;
  dec->mb_x_ = 0;
  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
  left_ctx->nz_dc_ = 0;
  left_ctx->nz_ = 0;
 }
 // Decodes the frame row by row, up to (excluding) row dec->br_mb_y_.
 // For each row: parse the intra modes from partition #0, decode every
 // macroblock from the token partition assigned to the row, then hand the row
 // over to VP8ProcessRow().
 // Returns 1 on success; on failure records the error and returns 0.
 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
  dec->mb_y_ = 0;
  while (dec->mb_y_ < dec->br_mb_y_) {
    // Rows are distributed over the token partitions in round-robin order.
    const int part_idx = dec->mb_y_ & (dec->num_parts_ - 1);
    VP8BitReader* const tokens = &dec->parts_[part_idx];
    if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                         "Premature end-of-partition0 encountered.");
    }
    while (dec->mb_x_ < dec->mb_w_) {
      if (!VP8DecodeMB(dec, tokens)) {
        return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                           "Premature end-of-file encountered.");
      }
      ++dec->mb_x_;
    }
    VP8InitScanline(dec);   // Prepare for next scanline
    // Reconstruct, filter and emit the row.
    if (!VP8ProcessRow(dec, io)) {
      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
    }
    ++dec->mb_y_;
  }
  return 1;
 }
 // Main entry point
 // Decodes one picture. Parses the headers first if that was not done yet,
 // then runs the setup / decode / teardown sequence.
 // Returns 0 on failure (the decoder is cleared with VP8Clear() if the failure
 // happens after the headers were parsed), a non-zero value on success.
 int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
  int ok;
  if (dec == NULL) {
    return 0;
  }
  if (io == NULL) {
    return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
                       "NULL VP8Io parameter in VP8Decode().");
  }
  if (!dec->ready_ && !VP8GetHeaders(dec, io)) {
    return 0;
  }
  assert(dec->ready_);
  // Finish setting up the decoding parameter. Will call io->setup().
  ok = (VP8EnterCritical(dec, io) == VP8_STATUS_OK);
  if (ok) {   // good to go.
    // Will allocate memory and prepare everything.
    ok = VP8InitFrame(dec, io);
    // Main decoding loop
    if (ok) ok = ParseFrame(dec, io);
    // Exit. Must run whenever VP8EnterCritical() succeeded.
    ok &= VP8ExitCritical(dec, io);
  }
  if (ok) {
    dec->ready_ = 0;
    return ok;
  }
  VP8Clear(dec);
  return 0;
 }
 // Releases the resources owned by the decoder (alpha decoder and main memory
 // chunk), zeroes the main bit reader and marks the decoder as not ready.
 // The decoder object itself is not freed. Passing NULL is a no-op.
 void VP8Clear(VP8Decoder* const dec) {
  if (dec != NULL) {
    ALPHDelete(dec->alph_dec_);
    dec->alph_dec_ = NULL;
    free(dec->mem_);
    dec->mem_size_ = 0;
    dec->mem_ = NULL;
    memset(&dec->br_, 0, sizeof(dec->br_));
    dec->ready_ = 0;
  }
 }
 //------------------------------------------------------------------------------
--- a/src/loaders/webp/dec/vp8i.h
+++ b/src/loaders/webp/dec/vp8i.h
@ -0,0 +1,307 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // VP8 decoder: internal header.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_DEC_VP8I_H_
 #define WEBP_DEC_VP8I_H_
 #include <string.h>     // for memcpy()
 #include "./common.h"
 #include "./vp8li.h"
 #include "../utils/bit_reader.h"
 #include "../utils/random.h"
 #include "../dsp/dsp.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // Various defines and enums
 // version numbers
 // Combined by WebPGetDecoderVersion() as (major << 16) | (minor << 8) | rev.
 #define DEC_MAJ_VERSION 0
 #define DEC_MIN_VERSION 4
 #define DEC_REV_VERSION 3
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
 // and two 8x8 chroma blocks (u/v). These should preferably be 16-byte aligned,
 // in order to be SIMD-friendly. We also need to store the top, left and
 // top-left samples (from previously decoded blocks), along with four
 // extra top-right samples for luma (intra4x4 prediction only).
 // One possible layout is, using 32 * (17 + 9) bytes:
 //
 //   .+------   <- only 1 pixel high
 //   .|yyyyt.
 //   .|yyyyt.
 //   .|yyyyt.
 //   .|yyyy..
 //   .+--.+--   <- only 1 pixel high
 //   .|uu.|vv
 //   .|uu.|vv
 //
 // Every character is a 4x4 block, with legend:
 //  '.' = unused
 //  'y' = y-samples   'u' = u-samples     'v' = v-samples
 //  '|' = left sample,   '-' = top sample,    '+' = top-left sample
 //  't' = extra top-right sample for 4x4 modes
 // All sizes and offsets below are expressed in bytes, BPS being the stride of
 // one row of the work area.
 // Total size: 17 rows for the luma part plus 9 rows for the chroma part.
 #define YUV_SIZE (BPS * 17 + BPS * 9)
 // Size of the luma part alone (17 rows).
 #define Y_SIZE   (BPS * 17)
 // Luma samples start after one full row and 8 bytes into the next one.
 #define Y_OFF    (BPS * 1 + 8)
 // U samples start 16 rows + 1 row after the luma samples, same column.
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 // V samples sit on the same rows as U, 16 bytes further right.
 #define V_OFF    (U_OFF + 16)
 // minimal width under which lossy multi-threading is always disabled
 #define MIN_WIDTH_FOR_THREADS 512
 //------------------------------------------------------------------------------
 // Headers
 // Content of the 3-byte frame tag, as decoded by VP8GetHeaders().
 typedef struct {
  uint8_t key_frame_;          // 1 when bit 0 of the tag is clear
  uint8_t profile_;            // bits 1..3 of the tag; values > 3 are rejected
  uint8_t show_;               // bit 4 of the tag; 0 is rejected
  uint32_t partition_length_;  // remaining bits: byte length of partition #0
 } VP8FrameHeader;
 // Picture-level parameters, filled by VP8GetHeaders() for key frames.
 typedef struct {
  uint16_t width_;       // 14-bit picture width, in pixels
  uint16_t height_;      // 14-bit picture height, in pixels
  uint8_t xscale_;       // 2 bits stored above the width
  uint8_t yscale_;       // 2 bits stored above the height
  uint8_t colorspace_;   // 0 = YCbCr
  uint8_t clamp_type_;   // single bit read from partition #0
 } VP8PictureHeader;
 // segment features
 // Defaults (see ResetSegmentHeader() in vp8.cpp): everything 0, except
 // absolute_delta_ which is 1.
 typedef struct {
  int use_segment_;       // whether segmentation is enabled for the frame
  int update_map_;        // whether to update the segment map or not
  int absolute_delta_;    // absolute or delta values for quantizer and filter
  // Both arrays hold signed values read from the bitstream (7 bits + sign for
  // the quantizer, 6 bits + sign for the filter strength), or 0 when absent.
  int8_t quantizer_[NUM_MB_SEGMENTS];        // quantization changes
  int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
 } VP8SegmentHeader;
 // probas associated to one of the contexts
 // One 8-bit probability per decision made while decoding a coefficient.
 typedef uint8_t VP8ProbaArray[NUM_PROBAS];
 typedef struct {   // all the probas associated to one band
  VP8ProbaArray probas_[NUM_CTX];   // indexed by context
 } VP8BandProbas;
 // Struct collecting all frame-persistent probabilities.
 typedef struct {
  // Segment-map tree probabilities; 255 for entries the bitstream leaves out.
  uint8_t segments_[MB_FEATURE_TREE_PROBS];
  // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
  VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
  // For each type, one pointer into bands_[type][] per coefficient position
  // (plus one sentinel entry), set up by VP8ParseProba().
  const VP8BandProbas* bands_ptr_[NUM_TYPES][16 + 1];
 } VP8Proba;
 // Filter parameters
 // Filled by ParseFilterHeader() in vp8.cpp.
 typedef struct {
  int simple_;                  // 0=complex, 1=simple
  int level_;                   // [0..63]
  int sharpness_;               // [0..7]
  int use_lf_delta_;            // whether the delta arrays below are in use
  // Signed 6-bit values; an entry only changes when the bitstream flags it.
  int ref_lf_delta_[NUM_REF_LF_DELTAS];
  int mode_lf_delta_[NUM_MODE_LF_DELTAS];
 } VP8FilterHeader;
 //------------------------------------------------------------------------------
 // Information about the macroblocks.
 // One entry per macroblock is stored in VP8Decoder::f_info_ when filtering is
 // enabled; it starts as a copy of the matching VP8Decoder::fstrengths_ entry.
 typedef struct {  // filter specs
  uint8_t f_limit_;      // filter limit in [3..189], or 0 if no filtering
  uint8_t f_ilevel_;     // inner limit in [1..63]
  uint8_t f_inner_;      // do inner filtering?
  uint8_t hev_thresh_;   // high edge variance threshold in [0..2]
 } VP8FInfo;
 // In VP8Decoder::mb_info_, entry [x] is the top context of macroblock column
 // x and entry [-1] is the left context of the current macroblock.
 typedef struct {  // Top/Left Contexts used for syntax-parsing
  uint8_t nz_;        // non-zero AC/DC coeffs (4bit for luma + 4bit for chroma)
  uint8_t nz_dc_;     // non-zero DC coeff (1bit)
 } VP8MB;
 // Dequantization matrices
 // Index 0 is applied to the coefficient at position 0, index 1 to all others.
 typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
  // y1_mat_: luma blocks, y2_mat_: the separate luma-DC block of non-intra4x4
  // macroblocks, uv_mat_: chroma blocks.
  quant_t y1_mat_, y2_mat_, uv_mat_;
  int uv_quant_;   // U/V quantizer value
  int dither_;     // dithering amplitude (0 = off, max=255)
 } VP8QuantMatrix;
 // Data needed to reconstruct a macroblock
 // One entry per macroblock of the current row (see VP8Decoder::mb_data_).
 typedef struct {
  int16_t coeffs_[384];   // 384 coeffs = (16+4+4) * 4*4
  uint8_t is_i4x4_;       // true if intra4x4
  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
  uint8_t uvmode_;        // chroma prediction mode
  // bit-wise info about the content of each sub-4x4 blocks (in decoding order).
  // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to:
  //   code=0 -> no coefficient
  //   code=1 -> only DC
  //   code=2 -> first three coefficients are non-zero
  //   code=3 -> more than three coefficients are non-zero
  // This allows to call specialized transform functions.
  uint32_t non_zero_y_;
  uint32_t non_zero_uv_;
  uint8_t dither_;      // local dithering strength (deduced from non_zero_*)
  uint8_t skip_;        // honoured only when VP8Decoder::use_skip_proba_ is set
  uint8_t segment_;     // index into VP8Decoder::dqm_[] and fstrengths_[]
 } VP8MBData;
 // Persistent information needed by the parallel processing
 // Embedded in VP8Decoder as thread_ctx_.
 typedef struct {
  int id_;              // cache row to process (in [0..2])
  int mb_y_;            // macroblock position of the row
  int filter_row_;      // true if row-filtering is needed
  VP8FInfo* f_info_;    // filter strengths (swapped with dec->f_info_)
  VP8MBData* mb_data_;  // reconstruction data (swapped with dec->mb_data_)
  VP8Io io_;            // copy of the VP8Io to pass to put()
 } VP8ThreadContext;
 // Saved top samples, per macroblock. Fits into a cache-line.
 // 16 luma + 8 + 8 chroma samples = 32 bytes.
 typedef struct {
  uint8_t y[16], u[8], v[8];
 } VP8TopSamples;
 //------------------------------------------------------------------------------
 // VP8Decoder: the main opaque structure handed over to user
 // Created by VP8New(), released by VP8Delete(); VP8Clear() frees the attached
 // resources without freeing the object itself.
 struct VP8Decoder {
  VP8StatusCode status_;
  int ready_;     // true if ready to decode a picture with VP8Decode()
  const char* error_msg_;  // set when status_ is not OK.
  // Main data source
  VP8BitReader br_;   // reads partition #0 (headers and intra modes)
  // headers
  VP8FrameHeader   frm_hdr_;
  VP8PictureHeader pic_hdr_;
  VP8FilterHeader  filter_hdr_;
  VP8SegmentHeader segment_hdr_;
  int cache_id_;       // current cache row
  int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
  VP8ThreadContext thread_ctx_;  // Thread context
  // dimension, in macroblock units.
  int mb_w_, mb_h_;
  // Macroblock to process/filter, depending on cropping and filter_type.
  int tl_mb_x_, tl_mb_y_;  // top-left MB that must be in-loop filtered
  int br_mb_x_, br_mb_y_;  // last bottom-right MB that must be decoded
  // number of partitions.
  int num_parts_;
  // per-partition boolean decoders.
  // Row mb_y_ reads its tokens from parts_[mb_y_ & (num_parts_ - 1)].
  VP8BitReader parts_[MAX_NUM_PARTITIONS];
  // Dithering strength, deduced from decoding options
  int dither_;                // whether to use dithering or not
  VP8Random dithering_rg_;    // random generator for dithering
  // dequantization (one set of DC/AC dequant factor per segment)
  VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
  // probabilities
  VP8Proba proba_;
  int use_skip_proba_;    // flag read by VP8ParseProba()
  uint8_t skip_p_;        // 8-bit value, read only when use_skip_proba_ is set
  // Boundary data cache and persistent buffers.
  uint8_t* intra_t_;      // top intra modes values: 4 * mb_w_
  uint8_t  intra_l_[4];   // left intra modes values
  VP8TopSamples* yuv_t_;  // top y/u/v samples
  VP8MB* mb_info_;        // contextual macroblock info (mb_w_ + 1)
  VP8FInfo* f_info_;      // filter strength info
  uint8_t* yuv_b_;        // main block for Y/U/V (size = YUV_SIZE)
  uint8_t* cache_y_;      // macroblock row for storing unfiltered samples
  uint8_t* cache_u_;
  uint8_t* cache_v_;
  int cache_y_stride_;
  int cache_uv_stride_;
  // main memory chunk for the above data. Persistent.
  void* mem_;
  size_t mem_size_;
  // Per-macroblock non-persistent information.
  int mb_x_, mb_y_;       // current position, in macroblock units
  VP8MBData* mb_data_;    // parsed reconstruction data
  // Filtering side-info
  int filter_type_;                          // 0=off, 1=simple, 2=complex
  VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type
  // Alpha
  struct ALPHDecoder* alph_dec_;  // alpha-plane decoder object
  const uint8_t* alpha_data_;     // compressed alpha data (if present)
  size_t alpha_data_size_;
  int is_alpha_decoded_;  // true if alpha_data_ is decoded in alpha_plane_
  uint8_t* alpha_plane_;  // output. Persistent, contains the whole data.
  int alpha_dithering_;   // derived from decoding options (0=off, 100=full).
 };
 //------------------------------------------------------------------------------
 // internal functions. Not public.
 // in vp8.c
 // Records 'error' and 'msg' on the decoder and clears its ready_ flag, unless
 // an earlier error is already stored. Always returns 0.
 int VP8SetError(VP8Decoder* const dec,
                VP8StatusCode error, const char* const msg);
 // in tree.c
 // NOTE(review): presumably restores the default probabilities; the
 // definition is not visible from here -- confirm.
 void VP8ResetProba(VP8Proba* const proba);
 // Reads the coefficient probabilities and the skip-probability fields from
 // 'br' into 'dec'.
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec);
 // parses one row of intra mode data in partition 0, returns !eof
 int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec);
 // in quant.c
 // Called by VP8GetHeaders() once the partitions have been positioned.
 void VP8ParseQuant(VP8Decoder* const dec);
 // in frame.c
 // Called by VP8Decode() between VP8EnterCritical() and the decoding loop; a
 // zero return value is treated as failure.
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
 // if ok, otherwise sets and returns the error status on *dec.
 VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
 // Must always be called in pair with VP8EnterCritical().
 // Returns false in case of error.
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
 // Initialize dithering post-process if needed.
 void VP8InitDithering(const WebPDecoderOptions* const options,
                      VP8Decoder* const dec);
 // Process the last decoded row (filtering + output).
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
 // Decode one macroblock. Returns false if there is not enough data.
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
 // in alpha.c
 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                      int row, int num_rows);
 //------------------------------------------------------------------------------
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_DEC_VP8I_H_ */
--- a/src/loaders/webp/dec/vp8l.cpp
+++ b/src/loaders/webp/dec/vp8l.cpp
--- a/src/loaders/webp/dec/vp8li.h
+++ b/src/loaders/webp/dec/vp8li.h
@ -0,0 +1,136 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Lossless decoder: internal header.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 //         Vikas Arora(vikaas.arora@gmail.com)
 #ifndef WEBP_DEC_VP8LI_H_
 #define WEBP_DEC_VP8LI_H_
 #include <string.h>     // for memcpy()
 #include "./webpi.h"
 #include "../utils/bit_reader.h"
 #include "../utils/color_cache.h"
 #include "../utils/huffman.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Decoding state stored in VP8LDecoder::state_.
 // NOTE(review): the state transitions live in vp8l.cpp, which is not visible
 // from this header; the per-value notes below are inferred from the names
 // only -- confirm against the implementation.
 typedef enum {
  READ_DATA = 0,   // presumably: pixel data is being decoded
  READ_HDR = 1,    // presumably: the header is being read
  READ_DIM = 2     // presumably: the image dimensions are being read
 } VP8LDecodeState;
 // One entry of the VP8LDecoder::transforms_ array: the type of a lossless
 // transform together with the parameters and data it operates with.
 typedef struct VP8LTransform VP8LTransform;
 struct VP8LTransform {
  VP8LImageTransformType type_;   // transform type.
  int                    bits_;   // subsampling bits defining transform window.
  int                    xsize_;  // transform window X index.
                                  // NOTE(review): the field name suggests a
                                  // width rather than an index -- confirm.
  int                    ysize_;  // transform window Y index.
                                  // NOTE(review): likewise, presumably a
                                  // height -- confirm.
  uint32_t              *data_;   // transform data.
 };
 // Header-level state of the lossless decoder: color-cache and Huffman-coding
 // bookkeeping. Embedded in VP8LDecoder as 'hdr_'.
 // NOTE(review): field meanings below are inferred from names and types; the
 // code that fills them is in vp8l.cpp -- confirm there.
 typedef struct {
  int             color_cache_size_;   // presumably 0 when no cache is used
  VP8LColorCache  color_cache_;
  VP8LColorCache  saved_color_cache_;  // for incremental
  int             huffman_mask_;
  int             huffman_subsample_bits_;
  int             huffman_xsize_;
  uint32_t       *huffman_image_;
  int             num_htree_groups_;   // presumably the length of htree_groups_
  HTreeGroup     *htree_groups_;
  HuffmanCode    *huffman_tables_;
 } VP8LMetadata;
 // Lossless (VP8L) decoder instance. Created with VP8LNew(), reset with
 // VP8LClear() and destroyed with VP8LDelete() (all declared below in this
 // header). Holds the bit reader, the decoding progress counters, the parsed
 // header metadata and the list of transforms.
 typedef struct VP8LDecoder VP8LDecoder;
 struct VP8LDecoder {
  VP8StatusCode    status_;        // status of the decoder; preserved by
                                   // VP8LClear().
  VP8LDecodeState  state_;
  VP8Io           *io_;
  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
  uint32_t        *pixels_;        // Internal data: either uint8_t* for alpha
                                   // or uint32_t* for BGRA.
  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
  VP8LBitReader    br_;
  int              incremental_;   // if true, incremental decoding is expected
  VP8LBitReader    saved_br_;      // note: could be local variables too
  int              saved_last_pixel_;
  int              width_;
  int              height_;
  int              last_row_;      // last input row decoded so far.
  int              last_pixel_;    // last pixel decoded so far. However, it may
                                   // not be transformed, scaled and
                                   // color-converted yet.
  int              last_out_row_;  // last row output so far.
  VP8LMetadata     hdr_;
  int              next_transform_;  // presumably the number of entries of
                                     // transforms_ in use -- confirm.
  VP8LTransform    transforms_[NUM_TRANSFORMS];
  // or'd bitset storing the transforms types.
  uint32_t         transforms_seen_;
  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
  WebPRescaler    *rescaler;         // Common rescaler for all channels.
 };
 //------------------------------------------------------------------------------
 // internal functions. Not public.
 struct ALPHDecoder;  // Defined in dec/alphai.h.
 // in vp8l.c
 // Decodes image header for alpha data stored using lossless compression.
 // Returns false in case of error.
 int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
                          const uint8_t* const data, size_t data_size,
                          uint8_t* const output);
 // Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
 // already decoded in previous call(s), it will resume decoding from where it
 // was paused.
 // Returns false in case of bitstream error.
 int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
                               int last_row);
 // Allocates and initializes a new lossless decoder instance.
 VP8LDecoder* VP8LNew(void);
 // Decodes the image header. Returns false in case of error.
 int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
 // Decodes an image. It's required to decode the lossless header before calling
 // this function. Returns false in case of error, with updated dec->status_.
 int VP8LDecodeImage(VP8LDecoder* const dec);
 // Resets the decoder in its initial state, reclaiming memory.
 // Preserves the dec->status_ value.
 void VP8LClear(VP8LDecoder* const dec);
 // Clears and deallocates a lossless decoder instance.
 void VP8LDelete(VP8LDecoder* const dec);
 //------------------------------------------------------------------------------
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_DEC_VP8LI_H_ */
--- a/src/loaders/webp/dec/webp.cpp
+++ b/src/loaders/webp/dec/webp.cpp
@ -0,0 +1,674 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Main decoding functions for WEBP images.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <stdlib.h>
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
 // VP8X Feature Flags.
 // Bit masks applied to the 32-bit flags word read from the VP8X chunk.
 // In this file only FRAGMENTS_FLAG, ANIMATION_FLAG and ALPHA_FLAG are tested
 // (see ParseHeadersInternal()); the other values are declared but not used
 // in the code visible here.
 typedef enum WebPFeatureFlags {
  FRAGMENTS_FLAG  = 0x00000001,
  ANIMATION_FLAG  = 0x00000002,
  XMP_FLAG        = 0x00000004,
  EXIF_FLAG       = 0x00000008,
  ALPHA_FLAG      = 0x00000010,
  ICCP_FLAG       = 0x00000020
 } WebPFeatureFlags;
 //------------------------------------------------------------------------------
 // RIFF layout is:
 //   Offset  tag
 //   0...3   "RIFF" 4-byte tag
 //   4...7   size of image data (including metadata) starting at offset 8
 //   8...11  "WEBP"   our form-type signature
 // The RIFF container (12 bytes) is followed by appropriate chunks:
 //   12..15  "VP8 ": 4-bytes tags, signaling the use of VP8 video format
 //   16..19  size of the raw VP8 image data, starting at offset 20
 //   20....  the VP8 bytes
 // Or,
 //   12..15  "VP8L": 4-bytes tags, signaling the use of VP8L lossless format
 //   16..19  size of the raw VP8L image data, starting at offset 20
 //   20....  the VP8L bytes
 // Or,
 //   12..15  "VP8X": 4-bytes tags, describing the extended-VP8 chunk.
 //   16..19  size of the VP8X chunk starting at offset 20.
 //   20..23  VP8X flags bit-map corresponding to the chunk-types present.
 //   24..26  Width of the Canvas Image.
 //   27..29  Height of the Canvas Image.
 // There can be extra chunks after the "VP8X" chunk (ICCP, FRGM, ANMF, VP8,
 // VP8L, XMP, EXIF  ...)
 // All sizes are in little-endian order.
 // Note: chunk data size must be padded to multiple of 2 when written.
 // Assembles a 24-bit value from the three bytes at 'data', least-significant
 // byte first.
 static WEBP_INLINE uint32_t get_le24(const uint8_t* const data) {
  const int low = data[0];
  const int middle = data[1] << 8;
  const int high = data[2] << 16;
  return low | middle | high;
 }
 // Assembles a 32-bit value from the four bytes at 'data', least-significant
 // byte first.
 static WEBP_INLINE uint32_t get_le32(const uint8_t* const data) {
  // Widen the top byte to uint32_t *before* shifting. Shifting the promoted
  // (signed) int by 24 would move a set bit 7 into the sign bit, which is
  // undefined in C and relies on an implementation-defined conversion in C++
  // before C++20.
  return (uint32_t)get_le24(data) | ((uint32_t)data[3] << 24);
 }
 // Checks for a RIFF container at the start of '*data' and, when one is
 // present and valid, advances '*data' / '*data_size' past its header.
 // Returns:
 //     VP8_STATUS_BITSTREAM_ERROR for a RIFF header with a wrong signature or
 //                                an out-of-range size,
 //     VP8_STATUS_NOT_ENOUGH_DATA for truncated data if have_all_data is true,
 //     VP8_STATUS_OK otherwise (including when no RIFF header is detected).
 // '*riff_size' receives the size read from the header, or 0 when no complete
 // RIFF header was found.
 static VP8StatusCode ParseRIFF(const uint8_t** const data,
                                size_t* const data_size, int have_all_data,
                                size_t* const riff_size) {
  uint32_t payload_size;
  assert(data != NULL);
  assert(data_size != NULL);
  assert(riff_size != NULL);
  *riff_size = 0;  // Until proven otherwise there is no RIFF container.
  // Input that does not begin with a complete "RIFF" header is left untouched.
  if (*data_size < RIFF_HEADER_SIZE || memcmp(*data, "RIFF", TAG_SIZE) != 0) {
    return VP8_STATUS_OK;
  }
  // The form type must be "WEBP".
  if (memcmp(*data + 8, "WEBP", TAG_SIZE) != 0) {
    return VP8_STATUS_BITSTREAM_ERROR;
  }
  payload_size = get_le32(*data + TAG_SIZE);
  // The payload must hold at least one chunk ("WEBP" + "VP8?nnnn").
  if (payload_size < TAG_SIZE + CHUNK_HEADER_SIZE) {
    return VP8_STATUS_BITSTREAM_ERROR;
  }
  if (payload_size > MAX_CHUNK_PAYLOAD) {
    return VP8_STATUS_BITSTREAM_ERROR;
  }
  // With the whole file at hand, a payload larger than what follows the
  // "RIFFnnnn" prefix means the bitstream is truncated.
  if (have_all_data && (payload_size > *data_size - CHUNK_HEADER_SIZE)) {
    return VP8_STATUS_NOT_ENOUGH_DATA;
  }
  // Valid container: report its size and step over the header.
  *riff_size = payload_size;
  *data += RIFF_HEADER_SIZE;
  *data_size -= RIFF_HEADER_SIZE;
  return VP8_STATUS_OK;
 }
 // Checks for a VP8X chunk at '*data' and, when present and valid, steps over
 // it.
 // Returns VP8_STATUS_BITSTREAM_ERROR for a VP8X chunk with a wrong size or
 //         too large a canvas,
 //         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
 //         VP8_STATUS_OK otherwise (including when the data does not start
 //         with "VP8X").
 // '*found_vp8x' tells whether a chunk was consumed; in that case the non-NULL
 // ones among width_ptr, height_ptr and flags_ptr receive the values read from
 // the chunk.
 static VP8StatusCode ParseVP8X(const uint8_t** const data,
                                size_t* const data_size,
                                int* const found_vp8x,
                                int* const width_ptr, int* const height_ptr,
                                uint32_t* const flags_ptr) {
  const uint32_t vp8x_size = CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
  assert(data != NULL);
  assert(data_size != NULL);
  assert(found_vp8x != NULL);
  *found_vp8x = 0;
  if (*data_size < CHUNK_HEADER_SIZE) {
    return VP8_STATUS_NOT_ENOUGH_DATA;  // Not even a chunk header available.
  }
  if (memcmp(*data, "VP8X", TAG_SIZE) != 0) {
    return VP8_STATUS_OK;               // Some other chunk: nothing to skip.
  }
  const uint8_t* const chunk = *data;
  if (get_le32(chunk + TAG_SIZE) != VP8X_CHUNK_SIZE) {
    return VP8_STATUS_BITSTREAM_ERROR;  // The payload size is fixed.
  }
  // The complete chunk is needed before its fields can be read.
  if (*data_size < vp8x_size) {
    return VP8_STATUS_NOT_ENOUGH_DATA;
  }
  const uint32_t chunk_flags = get_le32(chunk + 8);
  // Canvas dimensions are stored minus one, on 24 bits each.
  const int canvas_w = 1 + get_le24(chunk + 12);
  const int canvas_h = 1 + get_le24(chunk + 15);
  if (canvas_w * (uint64_t)canvas_h >= MAX_IMAGE_AREA) {
    return VP8_STATUS_BITSTREAM_ERROR;  // image is too large
  }
  if (flags_ptr != NULL) *flags_ptr = chunk_flags;
  if (width_ptr != NULL) *width_ptr = canvas_w;
  if (height_ptr != NULL) *height_ptr = canvas_h;
  // Step over the whole VP8X chunk.
  *data += vp8x_size;
  *data_size -= vp8x_size;
  *found_vp8x = 1;
  return VP8_STATUS_OK;
 }
 // Skips to the next VP8/VP8L chunk header in the data given the size of the
 // RIFF chunk 'riff_size' (0 meaning "no RIFF container", in which case the
 // total-size check is not performed).
 // Returns VP8_STATUS_BITSTREAM_ERROR if any invalid chunk size is encountered,
 //         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
 //         VP8_STATUS_OK otherwise.
 // On return '*data' / '*data_size' designate the last chunk header examined.
 // If an alpha chunk is found, *alpha_data and *alpha_size are set
 // appropriately; otherwise they are NULL / 0.
 static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
                                          size_t* const data_size,
                                          size_t const riff_size,
                                          const uint8_t** const alpha_data,
                                          size_t* const alpha_size) {
  const uint8_t* buf;
  size_t buf_size;
  // Accumulated on 64 bits: every chunk may legally be close to 4GB in size,
  // so a 32-bit sum could wrap around and let an oversized chunk pass the
  // 'riff_size' consistency check below.
  uint64_t total_size = TAG_SIZE +           // "WEBP".
                        CHUNK_HEADER_SIZE +  // "VP8Xnnnn".
                        VP8X_CHUNK_SIZE;     // data.
  assert(data != NULL);
  assert(data_size != NULL);
  buf = *data;
  buf_size = *data_size;
  assert(alpha_data != NULL);
  assert(alpha_size != NULL);
  *alpha_data = NULL;
  *alpha_size = 0;
  while (1) {
    uint32_t chunk_size;
    uint32_t disk_chunk_size;   // chunk_size with padding
    // Publish the current position so that the caller sees how far we got,
    // whatever the exit path.
    *data = buf;
    *data_size = buf_size;
    if (buf_size < CHUNK_HEADER_SIZE) {  // Insufficient data.
      return VP8_STATUS_NOT_ENOUGH_DATA;
    }
    chunk_size = get_le32(buf + TAG_SIZE);
    if (chunk_size > MAX_CHUNK_PAYLOAD) {
      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
    }
    // For odd-sized chunk-payload, there's one byte padding at the end.
    disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1;
    total_size += disk_chunk_size;
    // Check that total bytes skipped so far does not exceed riff_size.
    if (riff_size > 0 && (total_size > riff_size)) {
      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
    }
    // Start of a (possibly incomplete) VP8/VP8L chunk implies that we have
    // parsed all the optional chunks.
    // Note: This check must occur before the check 'buf_size < disk_chunk_size'
    // below to allow incomplete VP8/VP8L chunks.
    if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
        !memcmp(buf, "VP8L", TAG_SIZE)) {
      return VP8_STATUS_OK;
    }
    if (buf_size < disk_chunk_size) {             // Insufficient data.
      return VP8_STATUS_NOT_ENOUGH_DATA;
    }
    if (!memcmp(buf, "ALPH", TAG_SIZE)) {         // A valid ALPH header.
      *alpha_data = buf + CHUNK_HEADER_SIZE;
      *alpha_size = chunk_size;
    }
    // We have a full and valid chunk; skip it.
    buf += disk_chunk_size;
    buf_size -= disk_chunk_size;
  }
 }
 // Validates the VP8/VP8L Header ("VP8 nnnn" or "VP8Lnnnn") and skips over it.
 // Returns VP8_STATUS_BITSTREAM_ERROR for invalid (chunk larger than
 //         riff_size) VP8/VP8L header,
 //         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
 //         VP8_STATUS_OK otherwise.
 // If a VP8/VP8L chunk is found, *chunk_size is set to the size read from the
 // chunk header and '*data_ptr' / '*data_size' are advanced past that header.
 // Otherwise the data is treated as a raw bitstream: nothing is skipped and
 // *chunk_size is set to '*data_size'.
 // The flag '*is_lossless' is set to 1 in case of VP8L chunk / raw VP8L data.
 static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,
                                     size_t* const data_size, int have_all_data,
                                     size_t riff_size, size_t* const chunk_size,
                                     int* const is_lossless) {
  const uint8_t* data;
  int is_vp8, is_vp8l;
  const uint32_t minimal_size =
      TAG_SIZE + CHUNK_HEADER_SIZE;  // "WEBP" + "VP8 nnnn" OR
                                     // "WEBP" + "VP8Lnnnn"
  assert(data_ptr != NULL);
  assert(data_size != NULL);
  assert(chunk_size != NULL);
  assert(is_lossless != NULL);
  data = *data_ptr;
  assert(data != NULL);
  // Validate the available size first: the tag comparisons below read from
  // 'data' and must not run on a buffer shorter than a chunk header.
  if (*data_size < CHUNK_HEADER_SIZE) {
    return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
  }
  is_vp8 = !memcmp(data, "VP8 ", TAG_SIZE);
  is_vp8l = !memcmp(data, "VP8L", TAG_SIZE);
  if (is_vp8 || is_vp8l) {
    // Bitstream contains VP8/VP8L header.
    const uint32_t size = get_le32(data + TAG_SIZE);
    if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) {
      return VP8_STATUS_BITSTREAM_ERROR;  // Inconsistent size information.
    }
    if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) {
      return VP8_STATUS_NOT_ENOUGH_DATA;  // Truncated bitstream.
    }
    // Skip over CHUNK_HEADER_SIZE bytes from VP8/VP8L Header.
    *chunk_size = size;
    *data_ptr += CHUNK_HEADER_SIZE;
    *data_size -= CHUNK_HEADER_SIZE;
    *is_lossless = is_vp8l;
  } else {
    // Raw VP8/VP8L bitstream (no header).
    *is_lossless = VP8LCheckSignature(data, *data_size);
    *chunk_size = *data_size;
  }
  return VP8_STATUS_OK;
 }
 //------------------------------------------------------------------------------
 // Fetch '*width', '*height', '*has_alpha' and fill out 'headers' based on
 // 'data'. All the output parameters may be NULL. If 'headers' is NULL only the
 // minimal amount will be read to fetch the remaining parameters.
 // If 'headers' is non-NULL this function will attempt to locate both alpha
 // data (with or without a VP8X chunk) and the bitstream chunk (VP8/VP8L).
 // Note: The following chunk sequences (before the raw VP8/VP8L data) are
 // considered valid by this function:
 // RIFF + VP8(L)
 // RIFF + VP8X + (optional chunks) + VP8(L)
 // ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purpose.
 // VP8(L)     <-- Not a valid WebP format: only allowed for internal purpose.
 static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
                                           size_t data_size,
                                           int* const width,
                                           int* const height,
                                           int* const has_alpha,
                                           int* const has_animation,
                                           int* const format,
                                           WebPHeaderStructure* const headers) {
  // Dimensions announced by the VP8X chunk (if any) ...
  int canvas_width = 0;
  int canvas_height = 0;
  // ... and the dimensions eventually reported to the caller. They start out
  // as the canvas size and are overwritten by VP8GetInfo() / VP8LGetInfo().
  int image_width = 0;
  int image_height = 0;
  int found_riff = 0;
  int found_vp8x = 0;
  int animation_present = 0;
  int fragments_present = 0;
  // Without a 'headers' structure the input is treated as possibly partial.
  const int have_all_data = (headers != NULL) ? headers->have_all_data : 0;
  VP8StatusCode status;
  // Local working copy; only copied into '*headers' once everything parsed.
  WebPHeaderStructure hdrs;
  if (data == NULL || data_size < RIFF_HEADER_SIZE) {
    return VP8_STATUS_NOT_ENOUGH_DATA;
  }
  memset(&hdrs, 0, sizeof(hdrs));
  hdrs.data = data;
  hdrs.data_size = data_size;
  // Skip over RIFF header.
  status = ParseRIFF(&data, &data_size, have_all_data, &hdrs.riff_size);
  if (status != VP8_STATUS_OK) {
    return status;   // Wrong RIFF header / insufficient data.
  }
  found_riff = (hdrs.riff_size > 0);
  // Skip over VP8X.
  {
    uint32_t flags = 0;
    status = ParseVP8X(&data, &data_size, &found_vp8x,
                       &canvas_width, &canvas_height, &flags);
    if (status != VP8_STATUS_OK) {
      return status;  // Wrong VP8X / insufficient data.
    }
    animation_present = !!(flags & ANIMATION_FLAG);
    fragments_present = !!(flags & FRAGMENTS_FLAG);
    if (!found_riff && found_vp8x) {
      // Note: This restriction may be removed in the future, if it becomes
      // necessary to send VP8X chunk to the decoder.
      return VP8_STATUS_BITSTREAM_ERROR;
    }
    // Provisional outputs taken from the VP8X flags; 'has_alpha' may be
    // refined further down (VP8LGetInfo() / presence of an ALPH chunk).
    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG);
    if (has_animation != NULL) *has_animation = animation_present;
    if (format != NULL) *format = 0;   // default = undefined
    image_width = canvas_width;
    image_height = canvas_height;
    if (found_vp8x && (animation_present || fragments_present) &&
        headers == NULL) {
      status = VP8_STATUS_OK;
      goto ReturnWidthHeight;  // Just return features from VP8X header.
    }
  }
  // The memcmp() on "ALPH" below needs at least a full tag.
  if (data_size < TAG_SIZE) {
    status = VP8_STATUS_NOT_ENOUGH_DATA;
    goto ReturnWidthHeight;
  }
  // Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH".
  if ((found_riff && found_vp8x) ||
      (!found_riff && !found_vp8x && !memcmp(data, "ALPH", TAG_SIZE))) {
    status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size,
                                 &hdrs.alpha_data, &hdrs.alpha_data_size);
    if (status != VP8_STATUS_OK) {
      goto ReturnWidthHeight;  // Invalid chunk size / insufficient data.
    }
  }
  // Skip over VP8/VP8L header.
  status = ParseVP8Header(&data, &data_size, have_all_data, hdrs.riff_size,
                          &hdrs.compressed_size, &hdrs.is_lossless);
  if (status != VP8_STATUS_OK) {
    goto ReturnWidthHeight;  // Wrong VP8/VP8L chunk-header / insufficient data.
  }
  if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) {
    return VP8_STATUS_BITSTREAM_ERROR;
  }
  // '*format' is left at 0 (undefined) for animated / fragmented files;
  // otherwise it is 1 for a lossy (VP8) and 2 for a lossless (VP8L) bitstream.
  if (format != NULL && !(animation_present || fragments_present)) {
    *format = hdrs.is_lossless ? 2 : 1;
  }
  if (!hdrs.is_lossless) {
    if (data_size < VP8_FRAME_HEADER_SIZE) {
      status = VP8_STATUS_NOT_ENOUGH_DATA;
      goto ReturnWidthHeight;
    }
    // Validates raw VP8 data.
    if (!VP8GetInfo(data, data_size, (uint32_t)hdrs.compressed_size,
                    &image_width, &image_height)) {
      return VP8_STATUS_BITSTREAM_ERROR;
    }
  } else {
    if (data_size < VP8L_FRAME_HEADER_SIZE) {
      status = VP8_STATUS_NOT_ENOUGH_DATA;
      goto ReturnWidthHeight;
    }
    // Validates raw VP8L data.
    if (!VP8LGetInfo(data, data_size, &image_width, &image_height, has_alpha)) {
      return VP8_STATUS_BITSTREAM_ERROR;
    }
  }
  // Validates image size coherency.
  if (found_vp8x) {
    if (canvas_width != image_width || canvas_height != image_height) {
      return VP8_STATUS_BITSTREAM_ERROR;
    }
  }
  if (headers != NULL) {
    // Hand the parsed description back; 'offset' is the distance from the
    // start of the input to the current position in the bitstream.
    *headers = hdrs;
    headers->offset = data - headers->data;
    assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD);
    assert(headers->offset == headers->data_size - data_size);
  }
 ReturnWidthHeight:
  // A NOT_ENOUGH_DATA status is reported as success when a VP8X chunk was
  // found and the caller did not ask for 'headers': the VP8X values gathered
  // above are returned in that case.
  if (status == VP8_STATUS_OK ||
      (status == VP8_STATUS_NOT_ENOUGH_DATA && found_vp8x && headers == NULL)) {
    if (has_alpha != NULL) {
      // If the data did not contain a VP8X/VP8L chunk the only definitive way
      // to set this is by looking for alpha data (from an ALPH chunk).
      *has_alpha |= (hdrs.alpha_data != NULL);
    }
    if (width != NULL) *width = image_width;
    if (height != NULL) *height = image_height;
    return VP8_STATUS_OK;
  } else {
    return status;
  }
 }
 // Parses everything that precedes the first VP8/VP8L frame header and fills
 // out 'headers' (compressed_size, offset, alpha_data, alpha_data_size and
 // is_lossless) on success.
 // Returns: VP8_STATUS_OK, VP8_STATUS_BITSTREAM_ERROR (invalid header/chunk),
 // VP8_STATUS_NOT_ENOUGH_DATA (partial input) or VP8_STATUS_UNSUPPORTED_FEATURE
 // when the file is flagged as animated, which this decoder cannot handle.
 static VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
  int animated = 0;
  VP8StatusCode result;
  assert(headers != NULL);
  // Only 'headers' and the animation flag are of interest here; width, height,
  // alpha and format outputs are not requested.
  result = ParseHeadersInternal(headers->data, headers->data_size,
                                NULL, NULL, NULL, &animated,
                                NULL, headers);
  if (!animated) {
    return result;
  }
  // TODO(jzern): full support of animation frames will require API additions.
  switch (result) {
    case VP8_STATUS_OK:
    case VP8_STATUS_NOT_ENOUGH_DATA:
      return VP8_STATUS_UNSUPPORTED_FEATURE;
    default:
      return result;
  }
 }
 //------------------------------------------------------------------------------
 // "Into" decoding variants
 // Main flow
 // Decodes the WebP data 'data' (all of it must be available) into
 // params->output: parses the container, then runs either the lossy (VP8) or
 // the lossless (VP8L) decoder, allocating the output buffer in between.
 // Returns VP8_STATUS_OK on success. On failure the error status is returned
 // and, if decoding got past header parsing, params->output is freed.
 static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
                                 WebPDecParams* const params) {
  VP8StatusCode status;
  VP8Io io;
  WebPHeaderStructure headers;
  headers.data = data;
  headers.data_size = data_size;
  headers.have_all_data = 1;
  status = WebPParseHeaders(&headers);   // Process Pre-VP8 chunks.
  if (status != VP8_STATUS_OK) {
    return status;
  }
  assert(params != NULL);
  VP8InitIo(&io);
  // The decoders start right at the VP8/VP8L bitstream.
  io.data = headers.data + headers.offset;
  io.data_size = headers.data_size - headers.offset;
  WebPInitCustomIo(params, &io);  // Plug the I/O functions.
  if (!headers.is_lossless) {
    VP8Decoder* const dec = VP8New();
    if (dec == NULL) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
    dec->alpha_data_ = headers.alpha_data;
    dec->alpha_data_size_ = headers.alpha_data_size;
    // Decode bitstream header, update io->width/io->height.
    if (!VP8GetHeaders(dec, &io)) {
      status = dec->status_;   // An error occurred. Grab error status.
    } else {
      // Allocate/check output buffers.
      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
                                     params->output);
      if (status == VP8_STATUS_OK) {  // Decode
        VP8InitDithering(params->options, dec);
        if (!VP8Decode(dec, &io)) {
          status = dec->status_;
        }
      }
    }
    VP8Delete(dec);
  } else {
    VP8LDecoder* const dec = VP8LNew();
    if (dec == NULL) {
      return VP8_STATUS_OUT_OF_MEMORY;
    }
    if (!VP8LDecodeHeader(dec, &io)) {
      status = dec->status_;   // An error occurred. Grab error status.
    } else {
      // Allocate/check output buffers.
      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
                                     params->output);
      if (status == VP8_STATUS_OK) {  // Decode
        if (!VP8LDecodeImage(dec)) {
          status = dec->status_;
        }
      }
    }
    VP8LDelete(dec);
  }
  if (status != VP8_STATUS_OK) {
    WebPFreeDecBuffer(params->output);
  } else if (params->options != NULL && params->options->flip) {
    // Only touch the strides of a successfully decoded buffer. Running this
    // step after a failure would overwrite the decoding error with the result
    // of WebPFlipBuffer() and operate on a buffer that was just freed.
    status = WebPFlipBuffer(params->output);
  }
  return status;
 }
 //------------------------------------------------------------------------------
 // Decodes 'data' into a freshly set up output buffer using colorspace 'mode'.
 // The dimensions read from the bitstream are stored in '*width' / '*height'
 // when those pointers are non-NULL, and the buffer description is copied to
 // 'keep_info' when it is non-NULL. Returns the decoded samples (the RGBA
 // plane for RGB modes, the Y plane otherwise), or NULL on failure.
 static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* const data,
                        size_t data_size, int* const width, int* const height,
                        WebPDecBuffer* const keep_info) {
  WebPDecBuffer output;
  WebPDecParams params;
  WebPInitDecBuffer(&output);
  output.colorspace = mode;
  memset(&params, 0, sizeof(params));
  params.output = &output;
  // Retrieve (and report back) the required dimensions from bitstream.
  if (!WebPGetInfo(data, data_size, &output.width, &output.height)) {
    return NULL;
  }
  if (width != NULL) {
    *width = output.width;
  }
  if (height != NULL) {
    *height = output.height;
  }
  // Run the actual decoding.
  const VP8StatusCode decode_status = DecodeInto(data, data_size, &params);
  if (decode_status != VP8_STATUS_OK) {
    return NULL;
  }
  if (keep_info != NULL) {    // keep track of the side-info
    WebPCopyDecBuffer(&output, keep_info);
  }
  // Hand back the decoded samples; 'output' must not be cleared here.
  if (WebPIsRGBMode(mode)) {
    return output.u.RGBA.rgba;
  }
  return output.u.YUVA.y;
 }
 // Resets every field of '*features' to zero.
 static void DefaultFeatures(WebPBitstreamFeatures* const features) {
  assert(features != NULL);
  memset(features, 0, sizeof(WebPBitstreamFeatures));
 }
 // Fills '*features' from the headers found in 'data'.
 // Returns VP8_STATUS_INVALID_PARAM when 'data' or 'features' is NULL,
 // otherwise the status of the header parsing.
 static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
                                  WebPBitstreamFeatures* const features) {
  if (data == NULL || features == NULL) {
    return VP8_STATUS_INVALID_PARAM;
  }
  DefaultFeatures(features);
  // No header structure is passed: only as much data as needed to retrieve
  // the features gets parsed.
  int* const out_width = &features->width;
  int* const out_height = &features->height;
  int* const out_alpha = &features->has_alpha;
  int* const out_animation = &features->has_animation;
  int* const out_format = &features->format;
  return ParseHeadersInternal(data, data_size, out_width, out_height,
                              out_alpha, out_animation, out_format, NULL);
 }
 //------------------------------------------------------------------------------
 // Cropping and rescaling.
 // Sets up the crop_xxx, mb_w / mb_h, scaling, filtering and upsampling fields
 // of 'io' from 'options' (which may be NULL, meaning "no cropping, no
 // scaling") and from the io->width / io->height already stored in 'io'.
 // 'src_colorspace' is the colorspace of the compressed data: for non-RGB
 // sources the crop origin is snapped to even coordinates.
 // Returns 1 on success, 0 if the crop window does not fit in the frame or if
 // the requested scaled dimensions are not strictly positive.
 int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
                           VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
  const int W = io->width;
  const int H = io->height;
  int x = 0, y = 0, w = W, h = H;
  // Cropping
  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
  if (io->use_cropping) {
    w = options->crop_width;
    h = options->crop_height;
    x = options->crop_left;
    y = options->crop_top;
    if (!WebPIsRGBMode(src_colorspace)) {   // only snap for YUV420
      x &= ~1;
      y &= ~1;
    }
    // Bounds check written without 'x + w' / 'y + h': these sums overflow
    // (undefined behavior on signed int) for large user-supplied values.
    // Once x >= 0 and x < W are known, 'W - x' cannot overflow.
    if (x < 0 || y < 0 || w <= 0 || h <= 0 ||
        x >= W || w > W || w > W - x ||
        y >= H || h > H || h > H - y) {
      return 0;  // out of frame boundary error
    }
  }
  io->crop_left   = x;
  io->crop_top    = y;
  io->crop_right  = x + w;
  io->crop_bottom = y + h;
  io->mb_w = w;
  io->mb_h = h;
  // Scaling
  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
  if (io->use_scaling) {
    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
      return 0;
    }
    io->scaled_width = options->scaled_width;
    io->scaled_height = options->scaled_height;
  }
  // Filter
  io->bypass_filtering = options && options->bypass_filtering;
  // Fancy upsampler
 #ifdef FANCY_UPSAMPLING
  io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
 #endif
  if (io->use_scaling) {
    // disable filter (only for large downscaling ratio).
    // Note: this replaces the value taken from options->bypass_filtering
    // just above.
    io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
                           (io->scaled_height < H * 3 / 4);
    io->fancy_upsampling = 0;
  }
  return 1;
 }
 //------------------------------------------------------------------------------
 /************************************************************************/
 /* External Class Implementation                                        */
 /************************************************************************/
 // Decodes 'data' to BGRA samples. '*width' and '*height' (when non-NULL)
 // receive the image dimensions. Returns the decoded samples, or NULL on
 // failure.
 uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size,
                         int* width, int* height) {
  uint8_t* const samples =
      Decode(MODE_BGRA, data, data_size, width, height, NULL);
  return samples;
 }
 int WebPGetInfo(const uint8_t* data, size_t data_size,
                int* width, int* height) {
  WebPBitstreamFeatures features;
  if (GetFeatures(data, data_size, &features) != VP8_STATUS_OK) {
    return 0;
  }
  if (width != NULL) {
    *width  = features.width;
  }
  if (height != NULL) {
    *height = features.height;
  }
  return 1;
 }
--- a/src/loaders/webp/dec/webpi.h
+++ b/src/loaders/webp/dec/webpi.h
@ -0,0 +1,108 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Internal header: WebP decoding parameters and custom IO on buffer
 //
 // Author: somnath@google.com (Somnath Banerjee)
 #ifndef WEBP_DEC_WEBPI_H_
 #define WEBP_DEC_WEBPI_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "../utils/rescaler.h"
 #include "./decode_vp8.h"
 //------------------------------------------------------------------------------
 // WebPDecParams: Decoding output parameters. Transient internal object.
 typedef struct WebPDecParams WebPDecParams;
 // Callback signatures stored in WebPDecParams (see 'emit', 'emit_alpha' and
 // 'emit_alpha_row' below).
 // NOTE(review): the meaning of the int return values is not visible from
 // this header -- confirm against the output-stage implementation.
 typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
 typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos);
 // Everything the output stage needs while a picture is being decoded: the
 // destination buffer, optional decoding options, scratch memory, rescalers
 // and the emit callbacks.
 struct WebPDecParams {
  WebPDecBuffer* output;             // output buffer.
  uint8_t* tmp_y, *tmp_u, *tmp_v;    // cache for the fancy upsampler
                                     // or used for tmp rescaling
  int last_y;                 // coordinate of the line that was last output
  const WebPDecoderOptions* options;  // if not NULL, use alt decoding features
  // rescalers
  WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
  void* memory;                  // overall scratch memory for the output work.
  OutputFunc emit;               // output RGB or YUV samples
  OutputFunc emit_alpha;         // output alpha channel
  OutputRowFunc emit_alpha_row;  // output one line of rescaled alpha values
 };
 //------------------------------------------------------------------------------
 // Header parsing helpers
 // Structure storing a description of the RIFF headers.
 // 'data', 'data_size' and 'have_all_data' are inputs supplied by the caller;
 // the remaining fields are results filled in by the header parsing code.
 typedef struct {
  const uint8_t* data;         // input buffer
  size_t data_size;            // input buffer size
  int have_all_data;           // true if all data is known to be available
  size_t offset;               // offset to main data chunk (VP8 or VP8L)
  const uint8_t* alpha_data;   // points to alpha chunk (if present)
  size_t alpha_data_size;      // alpha chunk size
  size_t compressed_size;      // VP8/VP8L compressed data size
  size_t riff_size;            // size of the riff payload (or 0 if absent)
  int is_lossless;             // true if a VP8L chunk is present
 } WebPHeaderStructure;
 //------------------------------------------------------------------------------
 // Misc utils
 // Initializes VP8Io with custom setup, io and teardown functions. The default
 // hooks will use the supplied 'params' as io->opaque handle.
 void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
 // Setup crop_xxx fields, mb_w and mb_h in io. 'src_colorspace' refers
 // to the *compressed* format, not the output one.
 int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
                          VP8Io* const io, WEBP_CSP_MODE src_colorspace);
 //------------------------------------------------------------------------------
 // Internal functions regarding WebPDecBuffer memory (in buffer.c).
 // Don't really need to be externally visible for now.
 // Prepare 'buffer' with the requested initial dimensions width/height.
 // If no external storage is supplied, initializes buffer by allocating output
 // memory and setting up the stride information. Validate the parameters. Return
 // an error code in case of problem (no memory, or invalid stride / size /
 // dimension / etc.). If *options is not NULL, also verify that the options'
 // parameters are valid and apply them to the width/height dimensions of the
 // output buffer. This takes cropping / scaling / rotation into account.
 // Also incorporates the options->flip flag to flip the buffer parameters if
 // needed.
 VP8StatusCode WebPAllocateDecBuffer(int width, int height,
                                    const WebPDecoderOptions* const options,
                                    WebPDecBuffer* const buffer);
 // Flip buffer vertically by negating the various strides.
 VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer);
 // Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
 // memory (still held by 'src').
 void WebPCopyDecBuffer(const WebPDecBuffer* const src,
                       WebPDecBuffer* const dst);
 //------------------------------------------------------------------------------
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_DEC_WEBPI_H_ */
--- a/src/loaders/webp/dsp/alpha_processing.cpp
+++ b/src/loaders/webp/dsp/alpha_processing.cpp
@ -0,0 +1,377 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Utilities for processing transparent channel.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <assert.h>
 #include "./dsp.h"
 // Tables can be faster on some platform but incur some extra binary size (~2k).
 // #define USE_TABLES_FOR_ALPHA_MULT
 // -----------------------------------------------------------------------------
 // Fixed-point configuration shared by the alpha scaling helpers.
 #define MFIX 24    // 24bit fixed-point arithmetic
 #define HALF ((1u << MFIX) >> 1)
 #define KINV_255 ((1u << MFIX) / 255u)
 // Multiplies the 8-bit sample 'x' by the MFIX-bit fixed-point factor 'mult'
 // and rounds to nearest (HALF is added before the shift). The result is
 // asserted to fit in 8 bits.
 static uint32_t Mult(uint8_t x, uint32_t mult) {
  const uint32_t product = x * mult;
  const uint32_t rounded = product + HALF;
  const uint32_t result = rounded >> MFIX;
  assert(result <= 255);  // 24bit precision is enough to ensure that.
  return result;
 }
 #ifdef USE_TABLES_FOR_ALPHA_MULT
 static const uint32_t kMultTables[2][256] = {
  {    // (255u << MFIX) / alpha
    0x00000000, 0xff000000, 0x7f800000, 0x55000000, 0x3fc00000, 0x33000000,
    0x2a800000, 0x246db6db, 0x1fe00000, 0x1c555555, 0x19800000, 0x172e8ba2,
    0x15400000, 0x139d89d8, 0x1236db6d, 0x11000000, 0x0ff00000, 0x0f000000,
    0x0e2aaaaa, 0x0d6bca1a, 0x0cc00000, 0x0c249249, 0x0b9745d1, 0x0b1642c8,
    0x0aa00000, 0x0a333333, 0x09cec4ec, 0x0971c71c, 0x091b6db6, 0x08cb08d3,
    0x08800000, 0x0839ce73, 0x07f80000, 0x07ba2e8b, 0x07800000, 0x07492492,
    0x07155555, 0x06e45306, 0x06b5e50d, 0x0689d89d, 0x06600000, 0x063831f3,
    0x06124924, 0x05ee23b8, 0x05cba2e8, 0x05aaaaaa, 0x058b2164, 0x056cefa8,
    0x05500000, 0x05343eb1, 0x05199999, 0x05000000, 0x04e76276, 0x04cfb2b7,
    0x04b8e38e, 0x04a2e8ba, 0x048db6db, 0x0479435e, 0x04658469, 0x045270d0,
    0x04400000, 0x042e29f7, 0x041ce739, 0x040c30c3, 0x03fc0000, 0x03ec4ec4,
    0x03dd1745, 0x03ce540f, 0x03c00000, 0x03b21642, 0x03a49249, 0x03976fc6,
    0x038aaaaa, 0x037e3f1f, 0x03722983, 0x03666666, 0x035af286, 0x034fcace,
    0x0344ec4e, 0x033a5440, 0x03300000, 0x0325ed09, 0x031c18f9, 0x0312818a,
    0x03092492, 0x03000000, 0x02f711dc, 0x02ee5846, 0x02e5d174, 0x02dd7baf,
    0x02d55555, 0x02cd5cd5, 0x02c590b2, 0x02bdef7b, 0x02b677d4, 0x02af286b,
    0x02a80000, 0x02a0fd5c, 0x029a1f58, 0x029364d9, 0x028ccccc, 0x0286562d,
    0x02800000, 0x0279c952, 0x0273b13b, 0x026db6db, 0x0267d95b, 0x026217ec,
    0x025c71c7, 0x0256e62a, 0x0251745d, 0x024c1bac, 0x0246db6d, 0x0241b2f9,
    0x023ca1af, 0x0237a6f4, 0x0232c234, 0x022df2df, 0x02293868, 0x02249249,
    0x02200000, 0x021b810e, 0x021714fb, 0x0212bb51, 0x020e739c, 0x020a3d70,
    0x02061861, 0x02020408, 0x01fe0000, 0x01fa0be8, 0x01f62762, 0x01f25213,
    0x01ee8ba2, 0x01ead3ba, 0x01e72a07, 0x01e38e38, 0x01e00000, 0x01dc7f10,
    0x01d90b21, 0x01d5a3e9, 0x01d24924, 0x01cefa8d, 0x01cbb7e3, 0x01c880e5,
    0x01c55555, 0x01c234f7, 0x01bf1f8f, 0x01bc14e5, 0x01b914c1, 0x01b61eed,
    0x01b33333, 0x01b05160, 0x01ad7943, 0x01aaaaaa, 0x01a7e567, 0x01a5294a,
    0x01a27627, 0x019fcbd2, 0x019d2a20, 0x019a90e7, 0x01980000, 0x01957741,
    0x0192f684, 0x01907da4, 0x018e0c7c, 0x018ba2e8, 0x018940c5, 0x0186e5f0,
    0x01849249, 0x018245ae, 0x01800000, 0x017dc11f, 0x017b88ee, 0x0179574e,
    0x01772c23, 0x01750750, 0x0172e8ba, 0x0170d045, 0x016ebdd7, 0x016cb157,
    0x016aaaaa, 0x0168a9b9, 0x0166ae6a, 0x0164b8a7, 0x0162c859, 0x0160dd67,
    0x015ef7bd, 0x015d1745, 0x015b3bea, 0x01596596, 0x01579435, 0x0155c7b4,
    0x01540000, 0x01523d03, 0x01507eae, 0x014ec4ec, 0x014d0fac, 0x014b5edc,
    0x0149b26c, 0x01480a4a, 0x01466666, 0x0144c6af, 0x01432b16, 0x0141938b,
    0x01400000, 0x013e7063, 0x013ce4a9, 0x013b5cc0, 0x0139d89d, 0x01385830,
    0x0136db6d, 0x01356246, 0x0133ecad, 0x01327a97, 0x01310bf6, 0x012fa0be,
    0x012e38e3, 0x012cd459, 0x012b7315, 0x012a150a, 0x0128ba2e, 0x01276276,
    0x01260dd6, 0x0124bc44, 0x01236db6, 0x01222222, 0x0120d97c, 0x011f93bc,
    0x011e50d7, 0x011d10c4, 0x011bd37a, 0x011a98ef, 0x0119611a, 0x01182bf2,
    0x0116f96f, 0x0115c988, 0x01149c34, 0x0113716a, 0x01124924, 0x01112358,
    0x01100000, 0x010edf12, 0x010dc087, 0x010ca458, 0x010b8a7d, 0x010a72f0,
    0x01095da8, 0x01084a9f, 0x010739ce, 0x01062b2e, 0x01051eb8, 0x01041465,
    0x01030c30, 0x01020612, 0x01010204, 0x01000000 },
  {   // alpha * KINV_255
    0x00000000, 0x00010101, 0x00020202, 0x00030303, 0x00040404, 0x00050505,
    0x00060606, 0x00070707, 0x00080808, 0x00090909, 0x000a0a0a, 0x000b0b0b,
    0x000c0c0c, 0x000d0d0d, 0x000e0e0e, 0x000f0f0f, 0x00101010, 0x00111111,
    0x00121212, 0x00131313, 0x00141414, 0x00151515, 0x00161616, 0x00171717,
    0x00181818, 0x00191919, 0x001a1a1a, 0x001b1b1b, 0x001c1c1c, 0x001d1d1d,
    0x001e1e1e, 0x001f1f1f, 0x00202020, 0x00212121, 0x00222222, 0x00232323,
    0x00242424, 0x00252525, 0x00262626, 0x00272727, 0x00282828, 0x00292929,
    0x002a2a2a, 0x002b2b2b, 0x002c2c2c, 0x002d2d2d, 0x002e2e2e, 0x002f2f2f,
    0x00303030, 0x00313131, 0x00323232, 0x00333333, 0x00343434, 0x00353535,
    0x00363636, 0x00373737, 0x00383838, 0x00393939, 0x003a3a3a, 0x003b3b3b,
    0x003c3c3c, 0x003d3d3d, 0x003e3e3e, 0x003f3f3f, 0x00404040, 0x00414141,
    0x00424242, 0x00434343, 0x00444444, 0x00454545, 0x00464646, 0x00474747,
    0x00484848, 0x00494949, 0x004a4a4a, 0x004b4b4b, 0x004c4c4c, 0x004d4d4d,
    0x004e4e4e, 0x004f4f4f, 0x00505050, 0x00515151, 0x00525252, 0x00535353,
    0x00545454, 0x00555555, 0x00565656, 0x00575757, 0x00585858, 0x00595959,
    0x005a5a5a, 0x005b5b5b, 0x005c5c5c, 0x005d5d5d, 0x005e5e5e, 0x005f5f5f,
    0x00606060, 0x00616161, 0x00626262, 0x00636363, 0x00646464, 0x00656565,
    0x00666666, 0x00676767, 0x00686868, 0x00696969, 0x006a6a6a, 0x006b6b6b,
    0x006c6c6c, 0x006d6d6d, 0x006e6e6e, 0x006f6f6f, 0x00707070, 0x00717171,
    0x00727272, 0x00737373, 0x00747474, 0x00757575, 0x00767676, 0x00777777,
    0x00787878, 0x00797979, 0x007a7a7a, 0x007b7b7b, 0x007c7c7c, 0x007d7d7d,
    0x007e7e7e, 0x007f7f7f, 0x00808080, 0x00818181, 0x00828282, 0x00838383,
    0x00848484, 0x00858585, 0x00868686, 0x00878787, 0x00888888, 0x00898989,
    0x008a8a8a, 0x008b8b8b, 0x008c8c8c, 0x008d8d8d, 0x008e8e8e, 0x008f8f8f,
    0x00909090, 0x00919191, 0x00929292, 0x00939393, 0x00949494, 0x00959595,
    0x00969696, 0x00979797, 0x00989898, 0x00999999, 0x009a9a9a, 0x009b9b9b,
    0x009c9c9c, 0x009d9d9d, 0x009e9e9e, 0x009f9f9f, 0x00a0a0a0, 0x00a1a1a1,
    0x00a2a2a2, 0x00a3a3a3, 0x00a4a4a4, 0x00a5a5a5, 0x00a6a6a6, 0x00a7a7a7,
    0x00a8a8a8, 0x00a9a9a9, 0x00aaaaaa, 0x00ababab, 0x00acacac, 0x00adadad,
    0x00aeaeae, 0x00afafaf, 0x00b0b0b0, 0x00b1b1b1, 0x00b2b2b2, 0x00b3b3b3,
    0x00b4b4b4, 0x00b5b5b5, 0x00b6b6b6, 0x00b7b7b7, 0x00b8b8b8, 0x00b9b9b9,
    0x00bababa, 0x00bbbbbb, 0x00bcbcbc, 0x00bdbdbd, 0x00bebebe, 0x00bfbfbf,
    0x00c0c0c0, 0x00c1c1c1, 0x00c2c2c2, 0x00c3c3c3, 0x00c4c4c4, 0x00c5c5c5,
    0x00c6c6c6, 0x00c7c7c7, 0x00c8c8c8, 0x00c9c9c9, 0x00cacaca, 0x00cbcbcb,
    0x00cccccc, 0x00cdcdcd, 0x00cecece, 0x00cfcfcf, 0x00d0d0d0, 0x00d1d1d1,
    0x00d2d2d2, 0x00d3d3d3, 0x00d4d4d4, 0x00d5d5d5, 0x00d6d6d6, 0x00d7d7d7,
    0x00d8d8d8, 0x00d9d9d9, 0x00dadada, 0x00dbdbdb, 0x00dcdcdc, 0x00dddddd,
    0x00dedede, 0x00dfdfdf, 0x00e0e0e0, 0x00e1e1e1, 0x00e2e2e2, 0x00e3e3e3,
    0x00e4e4e4, 0x00e5e5e5, 0x00e6e6e6, 0x00e7e7e7, 0x00e8e8e8, 0x00e9e9e9,
    0x00eaeaea, 0x00ebebeb, 0x00ececec, 0x00ededed, 0x00eeeeee, 0x00efefef,
    0x00f0f0f0, 0x00f1f1f1, 0x00f2f2f2, 0x00f3f3f3, 0x00f4f4f4, 0x00f5f5f5,
    0x00f6f6f6, 0x00f7f7f7, 0x00f8f8f8, 0x00f9f9f9, 0x00fafafa, 0x00fbfbfb,
    0x00fcfcfc, 0x00fdfdfd, 0x00fefefe, 0x00ffffff }
 };
 // Table-driven scale lookup. Row 0 of kMultTables holds (255u << MFIX) / alpha
 // and is selected when 'inverse' is non-zero; row 1 holds alpha * KINV_255.
 static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
  const int row = inverse ? 0 : 1;
  return kMultTables[row][a];
 }
 #else
 // Computes the fixed-point scale for alpha value 'a': 255/a when 'inverse' is
 // non-zero, a/255 otherwise. The inverse path divides by 'a'; the row
 // functions in this file only call it after handling a == 0 themselves.
 static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
  if (inverse) {
    return (255u << MFIX) / a;
  }
  return a * KINV_255;
 }
 #endif    // USE_TABLES_FOR_ALPHA_MULT
 // Scales the three low bytes of each 32-bit pixel in 'ptr' by the pixel's own
 // alpha (top byte), or by its inverse when 'inverse' is non-zero. Pixels with
 // alpha 255 are left alone; pixels with alpha 0 are cleared to 0.
 void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t argb = ptr[x];
    const uint32_t alpha = argb >> 24;
    uint32_t scale, lo, mid, hi;
    if (alpha == 0xff) continue;    // fully opaque: nothing to scale
    if (alpha == 0) {               // fully transparent: wipe the pixel
      ptr[x] = 0;
      continue;
    }
    scale = GetScale(alpha, inverse);
    lo  = Mult((uint8_t)(argb >>  0), scale);
    mid = Mult((uint8_t)(argb >>  8), scale);
    hi  = Mult((uint8_t)(argb >> 16), scale);
    ptr[x] = (argb & 0xff000000u) | lo | (mid << 8) | (hi << 16);
  }
 }
 // Scales each sample of 'ptr' by the matching entry of 'alpha' (or by its
 // inverse when 'inverse' is non-zero). Samples whose alpha is 255 are kept,
 // samples whose alpha is 0 are set to 0.
 void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
                  int width, int inverse) {
  int x;
  for (x = 0; x < width; ++x) {
    const uint32_t a = alpha[x];
    if (a == 255) continue;
    if (a == 0) {
      ptr[x] = 0;
      continue;
    }
    ptr[x] = Mult(ptr[x], GetScale(a, inverse));
  }
 }
 #undef KINV_255
 #undef HALF
 #undef MFIX
 // Dispatch pointers for the per-row alpha multipliers.
 // WebPInitAlphaProcessing() points them at the plain-C implementations
 // (WebPMultARGBRowC / WebPMultRowC); they are unset until it has run.
 void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
 void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
                    int width, int inverse);
 //------------------------------------------------------------------------------
 // Generic per-plane calls
 // Applies WebPMultARGBRow to 'num_rows' consecutive rows of 'width' 32-bit
 // pixels, advancing 'ptr' by 'stride' bytes between rows.
 void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
                      int inverse) {
  while (num_rows-- > 0) {
    WebPMultARGBRow((uint32_t*)ptr, width, inverse);
    ptr += stride;
  }
 }
 // Applies WebPMultRow to 'num_rows' rows, stepping the sample plane by
 // 'stride' and the alpha plane by 'alpha_stride' after each row.
 void WebPMultRows(uint8_t* ptr, int stride,
                  const uint8_t* alpha, int alpha_stride,
                  int width, int num_rows, int inverse) {
  while (num_rows-- > 0) {
    WebPMultRow(ptr, alpha, width, inverse);
    ptr += stride;
    alpha += alpha_stride;
  }
 }
 //------------------------------------------------------------------------------
 // Premultiplied modes
 // non dithered-modes
 // (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
 // for all 8bit x or a. For bit-wise equivalence to (int)(x * a / 255. + .5),
 // one can use instead: (x * a * 65793 + (1 << 23)) >> 24
 #if 1     // (int)(x * a / 255.)
 #define MULTIPLIER(a)   ((a) * 32897U)
 #define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
 #else     // (int)(x * a / 255. + .5)
 #define MULTIPLIER(a) ((a) * 65793U)
 #define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
 #endif
 // Premultiplies the three color bytes of every 4-byte pixel by its alpha, in
 // place. 'alpha_first' selects whether alpha is byte 0 (colors at 1..3) or
 // byte 3 (colors at 0..2). Rows are 'stride' bytes apart. Pixels with alpha
 // 0xff are skipped; the alpha byte itself is never modified.
 static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
                               int w, int h, int stride) {
  const int alpha_offset = alpha_first ? 0 : 3;
  const int color_offset = alpha_first ? 1 : 0;
  int row;
  for (row = 0; row < h; ++row) {
    uint8_t* pixel = rgba;
    int col;
    for (col = 0; col < w; ++col, pixel += 4) {
      const uint32_t a = pixel[alpha_offset];
      uint8_t* const rgb = pixel + color_offset;
      uint32_t mult;
      if (a == 0xff) continue;
      mult = MULTIPLIER(a);
      rgb[0] = PREMULTIPLY(rgb[0], mult);
      rgb[1] = PREMULTIPLY(rgb[1], mult);
      rgb[2] = PREMULTIPLY(rgb[2], mult);
    }
    rgba += stride;
  }
 }
 #undef MULTIPLIER
 #undef PREMULTIPLY
 // rgbA4444
 #define MULTIPLIER(a)  ((a) * 0x1111)    // 0x1111 ~= (1 << 16) / 15
 // Expands the high nibble of 'x' to 8 bits by copying it into the low nibble.
 static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
  const uint8_t hi = x & 0xf0;
  return hi | (hi >> 4);
 }
 // Expands the low nibble of 'x' to 8 bits by copying it into the high nibble.
 static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
  const uint8_t lo = x & 0x0f;
  return lo | (lo << 4);
 }
 // Scales 'x' by the 16-bit fixed-point factor 'm' (result is (x * m) >> 16,
 // truncated to 8 bits).
 static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
  const uint32_t product = x * m;
  return product >> 16;
 }
 // Premultiplies 16-bit (4 bits per channel) pixels in place. Each pixel is two
 // bytes: the one at 'rg_byte_pos' holds the R/G nibbles, the other holds B in
 // its high nibble and alpha in its low nibble. Channels are widened to 8 bits,
 // scaled by alpha, then their high nibbles are repacked. Alpha is preserved.
 static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
                                               int w, int h, int stride,
                                               int rg_byte_pos /* 0 or 1 */) {
  const int ba_byte_pos = rg_byte_pos ^ 1;
  int row;
  for (row = 0; row < h; ++row) {
    uint8_t* px = rgba4444;
    int col;
    for (col = 0; col < w; ++col, px += 2) {
      const uint32_t rg = px[rg_byte_pos];
      const uint32_t ba = px[ba_byte_pos];
      const uint8_t alpha = ba & 0x0f;
      const uint32_t scale = MULTIPLIER(alpha);
      const uint8_t red   = multiply(dither_hi(rg), scale);
      const uint8_t green = multiply(dither_lo(rg), scale);
      const uint8_t blue  = multiply(dither_hi(ba), scale);
      px[rg_byte_pos] = (red & 0xf0) | ((green >> 4) & 0x0f);
      px[ba_byte_pos] = (blue & 0xf0) | alpha;
    }
    rgba4444 += stride;
  }
 }
 #undef MULTIPLIER
 // Entry point for 4444 premultiplication: picks the byte holding the R/G
 // nibbles according to WEBP_SWAP_16BIT_CSP and forwards to
 // ApplyAlphaMultiply4444().
 static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
                                   int w, int h, int stride) {
 #ifdef WEBP_SWAP_16BIT_CSP
  const int rg_byte_pos = 1;
 #else
  const int rg_byte_pos = 0;
 #endif
  ApplyAlphaMultiply4444(rgba4444, w, h, stride, rg_byte_pos);
 }
 // Copies a width x height alpha plane into every 4th byte of 'dst' (starting
 // at dst[0] of each row). Returns non-zero if at least one copied value is
 // not 0xff, and 0 when every value is 0xff (or nothing was copied).
 static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
                         int width, int height,
                         uint8_t* dst, int dst_stride) {
  uint32_t and_mask = 0xff;
  while (height-- > 0) {
    int col;
    for (col = 0; col < width; ++col) {
      const uint32_t value = alpha[col];
      dst[4 * col] = value;
      and_mask &= value;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  return (and_mask != 0xff);
 }
 // Writes each alpha sample into bits 8..15 of the matching 32-bit 'dst'
 // entry, with all other bits zero. 'dst_stride' is counted in uint32_t
 // elements, 'alpha_stride' in bytes.
 static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
                                 int width, int height,
                                 uint32_t* dst, int dst_stride) {
  while (height-- > 0) {
    int col;
    for (col = 0; col < width; ++col) {
      dst[col] = alpha[col] << 8;  // leave A/R/B channels zero'd.
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
 }
 // Copies byte 0 of every 4-byte pixel of 'argb' into the 'alpha' plane.
 // Returns non-zero when every extracted value equals 0xff, 0 otherwise.
 static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
  uint8_t and_mask = 0xff;
  while (height-- > 0) {
    int col;
    for (col = 0; col < width; ++col) {
      const uint8_t value = argb[4 * col];
      alpha[col] = value;
      and_mask &= value;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  return (and_mask == 0xff);
 }
 // Dispatch pointers for the alpha helpers. WebPInitAlphaProcessing() assigns
 // the C implementations (ApplyAlphaMultiply, ApplyAlphaMultiply_16b,
 // DispatchAlpha, DispatchAlphaToGreen, ExtractAlpha) and may then call the
 // SSE2 / MIPS dspR2 init routines, which are meant to overwrite some of them.
 void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
 void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
 int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 //------------------------------------------------------------------------------
 // Init function
 // Platform-specific initializers; only called when the matching WEBP_USE_*
 // macro is defined.
 extern void WebPInitAlphaProcessingMIPSdspR2(void);
 extern void WebPInitAlphaProcessingSSE2(void);
 // Remembers the VP8GetCPUInfo value seen by the last completed init. It starts
 // out holding its own address, presumably so that it cannot compare equal to
 // VP8GetCPUInfo before the first initialization has run.
 static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
    (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
 // Installs the alpha-processing function pointers. Does nothing if it already
 // ran with the current VP8GetCPUInfo value; changing VP8GetCPUInfo causes the
 // next call to re-initialize.
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
  // Start from the portable C implementations.
  WebPMultARGBRow = WebPMultARGBRowC;
  WebPMultRow = WebPMultRowC;
  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
  WebPDispatchAlpha = DispatchAlpha;
  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
  WebPExtractAlpha = ExtractAlpha;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitAlphaProcessingSSE2();
    }
 #endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      WebPInitAlphaProcessingMIPSdspR2();
    }
 #endif
  }
  // Record the CPU-info hook last, once every pointer has been assigned.
  alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/loaders/webp/dsp/argb.cpp
+++ b/src/loaders/webp/dsp/argb.cpp
@ -0,0 +1,68 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //   ARGB making functions.
 //
 // Author: Djordje Pesut (djordje.pesut@imgtec.com)
 #include "./dsp.h"
 // Packs four channel values into one 32-bit word: a in bits 24..31, r in
 // 16..23, g in 8..15, b in 0..7.
 static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
  const uint32_t alpha_bits = (uint32_t)a << 24;
  return (alpha_bits | (r << 16) | (g << 8) | b);
 }
 // Builds 'len' packed ARGB words in 'out'. Each of the four source pointers
 // is read with a fixed step of 4 bytes per pixel.
 static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
                     const uint8_t* b, int len, uint32_t* out) {
  int i;
  for (i = 0; i < len; ++i) {
    const int k = 4 * i;
    out[i] = MakeARGB32(a[k], r[k], g[k], b[k]);
  }
 }
 // Builds 'len' packed ARGB words in 'out' with alpha forced to 0xff. The
 // three source pointers are read every 'step' bytes.
 static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
                    int len, int step, uint32_t* out) {
  int i, offset;
  for (i = 0, offset = 0; i < len; ++i, offset += step) {
    out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
  }
 }
 // Dispatch pointers for the ARGB packers. VP8EncDspARGBInit() sets them to
 // PackARGB / PackRGB before optionally calling the SSE2 / MIPS dspR2
 // initializers.
 void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
                    const uint8_t*, int, uint32_t*);
 void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
                   int, int, uint32_t*);
 // Platform-specific initializers; only called when the matching WEBP_USE_*
 // macro is defined.
 extern void VP8EncDspARGBInitMIPSdspR2(void);
 extern void VP8EncDspARGBInitSSE2(void);
 // Remembers the VP8GetCPUInfo value seen by the last completed init. It starts
 // out holding its own address, presumably so that it cannot compare equal to
 // VP8GetCPUInfo before the first initialization has run.
 static volatile VP8CPUInfo argb_last_cpuinfo_used =
    (VP8CPUInfo)&argb_last_cpuinfo_used;
 // Installs the ARGB packing function pointers. Returns immediately when it
 // already ran with the current VP8GetCPUInfo value.
 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
  if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
  // Start from the portable C implementations.
  VP8PackARGB = PackARGB;
  VP8PackRGB = PackRGB;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspARGBInitSSE2();
    }
 #endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8EncDspARGBInitMIPSdspR2();
    }
 #endif
  }
  // Record the CPU-info hook last, once every pointer has been assigned.
  argb_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/loaders/webp/dsp/cpu.cpp
+++ b/src/loaders/webp/dsp/cpu.cpp
@ -0,0 +1,120 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // CPU detection
 //
 // Author: Christian Duvivier (cduvivier@google.com)
 #include "./dsp.h"
 //------------------------------------------------------------------------------
 // SSE2 detection.
 //
 // apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
 #if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
 // 32-bit PIC build: runs CPUID with eax = info_type and ecx = 0. ebx is saved
 // in edi before the instruction and swapped back afterwards, so CPUID's ebx
 // result is returned through edi. Outputs: cpu_info[0..3] = eax, ebx, ecx, edx.
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "mov %%ebx, %%edi\n"
    "cpuid\n"
    "xchg %%edi, %%ebx\n"
    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type), "c"(0));
 }
 #elif defined(__i386__) || defined(__x86_64__)
 // Other GCC-style x86 builds: runs CPUID with eax = info_type and ecx = 0 and
 // stores eax, ebx, ecx, edx into cpu_info[0..3].
 static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
  __asm__ volatile (
    "cpuid\n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type), "c"(0));
 }
 #elif (defined(_M_X64) || defined(_M_IX86)) && \
      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
 // MSVC with __cpuidex available: map GetCPUInfo onto the intrinsic.
 #include <intrin.h>
 #define GetCPUInfo(info, type) __cpuidex(info, type, 0)  // set ecx=0
 #elif defined(WEBP_MSC_SSE2)
 // Older MSVC: fall back to __cpuid, which takes no explicit ecx argument.
 #define GetCPUInfo __cpuid
 #endif
 // NaCl has no support for xgetbv or the raw opcode.
 #if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
 // Executes XGETBV with ecx = 0 and returns the 64-bit result edx:eax.
 // NOTE(review): x86CPUInfo() in this file only tests SSE2 and does not call
 // xgetbv(); confirm whether this helper is still needed.
 static WEBP_INLINE uint64_t xgetbv(void) {
  const uint32_t ecx = 0;
  uint32_t eax, edx;
  // Use the raw opcode for xgetbv for compatibility with older toolchains.
  __asm__ volatile (
    ".byte 0x0f, 0x01, 0xd0\n"
    : "=a"(eax), "=d"(edx) : "c" (ecx));
  return ((uint64_t)edx << 32) | eax;
 }
 #elif (defined(_M_X64) || defined(_M_IX86)) && \
      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
 // MSVC with the _xgetbv intrinsic: query index 0.
 #include <immintrin.h>
 #define xgetbv() _xgetbv(0)
 #elif defined(_MSC_VER) && defined(_M_IX86)
 // Older 32-bit MSVC: emit the XGETBV opcode bytes by hand with ecx = 0 and
 // return edx:eax.
 static WEBP_INLINE uint64_t xgetbv(void) {
  uint32_t eax_, edx_;
  __asm {
    xor ecx, ecx  // ecx = 0
    // Use the raw opcode for xgetbv for compatibility with older toolchains.
    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
    mov eax_, eax
    mov edx_, edx
  }
  return ((uint64_t)edx_ << 32) | eax_;
 }
 #else
 #define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
 #endif
 #if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
 // x86 feature query. Runs CPUID leaf 1 and reports kSSE2 from bit 26 of edx;
 // every other feature is reported as unavailable.
 static int x86CPUInfo(CPUFeature feature) {
  int regs[4];
  GetCPUInfo(regs, 1);
  if (feature != kSSE2) {
    return 0;
  }
  return (regs[3] & 0x04000000) != 0;
 }
 VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
 #elif defined(WEBP_ANDROID_NEON)  // NB: needs to be before generic NEON test.
 // Android feature query. kNEON is reported when the CPU family is ARM and the
 // NEON feature bit is set; every other feature is reported as unavailable.
 static int AndroidCPUInfo(CPUFeature feature) {
  const AndroidCpuFamily family = android_getCpuFamily();
  const uint64_t features = android_getCpuFeatures();
  if (feature != kNEON) {
    return 0;
  }
  return (family == ANDROID_CPU_FAMILY_ARM &&
          (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0);
 }
 VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
 #elif defined(WEBP_USE_NEON)
 // Stub feature query for NEON builds: reports every feature as available.
 // Having a real function here allows NEON to be turned off at runtime by
 // setting VP8GetCPUInfo to NULL.
 static int armCPUInfo(CPUFeature feature) {
  const int always_available = 1;
  (void)feature;
  return always_available;
 }
 VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
 #elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2)
 // MIPS feature query: reports kMIPS32 and kMIPSdspR2 as available and every
 // other feature as unavailable.
 static int mipsCPUInfo(CPUFeature feature) {
  const int is_mips_feature = (feature == kMIPS32) || (feature == kMIPSdspR2);
  return is_mips_feature ? 1 : 0;
 }
 VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
 #else
 VP8CPUInfo VP8GetCPUInfo = NULL;
 #endif
--- a/src/loaders/webp/dsp/dec.cpp
+++ b/src/loaders/webp/dsp/dec.cpp
@ -0,0 +1,766 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Speed-critical decoding functions, default plain-C implementations.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./dsp.h"
 #include "../dec/vp8i.h"
 //------------------------------------------------------------------------------
 // Clamps 'v' to the [0, 255] range.
 static WEBP_INLINE uint8_t clip_8b(int v) {
  if (v < 0) return 0;
  if (v > 255) return 255;
  return v;
 }
 //------------------------------------------------------------------------------
 // Transforms (Paragraph 14.4)
 // STORE(x, y, v): adds (v >> 3) to the pixel at column x, row y of 'dst'
 // (rows are BPS bytes apart) and clips the sum to 8 bits. Expects a variable
 // named 'dst' in the enclosing scope.
 #define STORE(x, y, v) \
  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
 // STORE2(y, dc, d, c): stores the four pixels of row y as
 // dc + d, dc + c, dc - c, dc - d (each through STORE).
 #define STORE2(y, dc, d, c) do {    \
  const int DC = (dc);              \
  STORE(0, y, DC + (d));            \
  STORE(1, y, DC + (c));            \
  STORE(2, y, DC - (c));            \
  STORE(3, y, DC - (d));            \
 } while (0)
 // Transform constants in 16-bit fixed point (used with MUL below):
 // kC1 / 65536 ~= sqrt(2) * cos(pi / 8), kC2 / 65536 ~= sqrt(2) * sin(pi / 8).
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 // Fixed-point multiply: product of a and b with the low 16 bits dropped.
 #define MUL(a, b) (((a) * (b)) >> 16)
 // Inverse transform of one 4x4 block: 'in' holds 16 coefficients, and the
 // result is added to the 4x4 pixel area at 'dst' (rows BPS bytes apart),
 // with rounding and clipping to 8 bits.
 static void TransformOne(const int16_t* in, uint8_t* dst) {
  int C[4 * 4];
  int k;
  // Vertical pass: column k of 'in' produces entries C[4k .. 4k+3].
  for (k = 0; k < 4; ++k) {
    const int16_t* const col = in + k;
    int* const out = C + 4 * k;
    const int a = col[0] + col[8];    // [-4096, 4094]
    const int b = col[0] - col[8];    // [-4095, 4095]
    const int c = MUL(col[4], kC2) - MUL(col[12], kC1);   // [-3783, 3783]
    const int d = MUL(col[4], kC1) + MUL(col[12], kC2);   // [-3785, 3781]
    out[0] = a + d;   // [-7881, 7875]
    out[1] = b + c;   // [-7878, 7878]
    out[2] = b - c;   // [-7878, 7878]
    out[3] = a - d;   // [-7877, 7879]
  }
  // Each pass is expanding the dynamic range by ~3.85 (upper bound).
  // The exact value is (2. + (kC1 + kC2) / 65536).
  // After the second pass, maximum interval is [-3794, 3794], assuming
  // an input in [-2048, 2047] interval. We then need to add a dst value
  // in the [0, 255] range.
  // In the worst case scenario, the input to clip_8b() can be as large as
  // [-60713, 60968].
  // Horizontal pass: one output row per iteration; '+ 4' is the rounder for
  // the '>> 3' performed inside STORE.
  for (k = 0; k < 4; ++k) {
    const int* const tmp = C + k;
    const int dc = tmp[0] + 4;
    const int a = dc + tmp[8];
    const int b = dc - tmp[8];
    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
    STORE(0, 0, a + d);
    STORE(1, 0, b + c);
    STORE(2, 0, b - c);
    STORE(3, 0, a - d);
    dst += BPS;
  }
 }
 // Simplified transform when only in[0], in[1] and in[4] are non-zero.
 // in[4] contributes a per-row offset, in[1] the horizontal terms of each row.
 static void TransformAC3(const int16_t* in, uint8_t* dst) {
  const int base = in[0] + 4;
  const int c_col = MUL(in[4], kC2);
  const int d_col = MUL(in[4], kC1);
  const int c_row = MUL(in[1], kC2);
  const int d_row = MUL(in[1], kC1);
  const int row_base[4] = {
    base + d_col, base + c_col, base - c_col, base - d_col
  };
  int y;
  for (y = 0; y < 4; ++y) {
    STORE2(y, row_base[y], d_row, c_row);
  }
 }
 #undef MUL
 #undef STORE2
 // Transforms one 4x4 block, or two horizontally adjacent ones when 'do_two'
 // is non-zero (second block: coefficients at in + 16, pixels at dst + 4).
 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int k;
  for (k = 0; k < num_blocks; ++k) {
    TransformOne(in + 16 * k, dst + 4 * k);
  }
 }
 // Transforms four 4x4 blocks laid out as a 2x2 grid: two VP8Transform calls,
 // each handling a horizontal pair (top pair first, then the pair 4 rows down).
 static void TransformUV(const int16_t* in, uint8_t* dst) {
  const int16_t* const top_coeffs = in + 0 * 16;
  const int16_t* const bottom_coeffs = in + 2 * 16;
  uint8_t* const bottom_dst = dst + 4 * BPS;
  VP8Transform(top_coeffs, dst, 1);
  VP8Transform(bottom_coeffs, bottom_dst, 1);
 }
 static void TransformDC(const int16_t* in, uint8_t* dst) {
  const int DC = in[0] + 4;
  int i, j;
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) {
      STORE(i, j, DC);
    }
  }
 }
 // DC-only transform for a 2x2 grid of 4x4 blocks. Block k (coefficients at
 // in + 16 * k) is skipped when its DC coefficient is zero.
 static void TransformDCUV(const int16_t* in, uint8_t* dst) {
  int k;
  for (k = 0; k < 4; ++k) {
    const int16_t* const coeffs = in + k * 16;
    // k & 1 selects the column (0 or 4), k >> 1 the row band (0 or 4 rows).
    uint8_t* const block = dst + (k & 1) * 4 + (k >> 1) * 4 * BPS;
    if (coeffs[0]) VP8TransformDC(coeffs, block);
  }
 }
 #undef STORE
 //------------------------------------------------------------------------------
 // Paragraph 14.3
 // Inverse Walsh-Hadamard transform of the 16 values in 'in'. The 16 results
 // are written 16 entries apart in 'out' (out[0], out[16], ..., out[240]).
 static void TransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int col, row;
  // First pass: butterflies along each column of the 4x4 input.
  for (col = 0; col < 4; ++col) {
    const int s0 = in[0 + col];
    const int s1 = in[4 + col];
    const int s2 = in[8 + col];
    const int s3 = in[12 + col];
    const int a0 = s0 + s3;
    const int a1 = s1 + s2;
    const int a2 = s1 - s2;
    const int a3 = s0 - s3;
    tmp[0  + col] = a0 + a1;
    tmp[4  + col] = a3 + a2;
    tmp[8  + col] = a0 - a1;
    tmp[12 + col] = a3 - a2;
  }
  // Second pass: butterflies along each row, with rounding before '>> 3'.
  for (row = 0; row < 4; ++row) {
    const int* const t = tmp + 4 * row;
    int16_t* const dst = out + 64 * row;
    const int dc = t[0] + 3;    // w/ rounder
    const int a0 = dc + t[3];
    const int a1 = t[1] + t[2];
    const int a2 = t[1] - t[2];
    const int a3 = dc - t[3];
    dst[ 0] = (a0 + a1) >> 3;
    dst[16] = (a3 + a2) >> 3;
    dst[32] = (a0 - a1) >> 3;
    dst[48] = (a3 - a2) >> 3;
  }
 }
 // Dispatch pointer with the same signature as TransformWHT(); presumably
 // bound to it (or to a platform-specific variant) by the DSP init code --
 // TODO confirm.
 void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
 //------------------------------------------------------------------------------
 // Intra predictions
 #define DST(x, y) dst[(x) + (y) * BPS]
 // TrueMotion prediction for a size x size block: each pixel becomes
 // top[x] + left[y] - top_left, clipped through the VP8kclip1 lookup table.
 // The top row, left column and top-left corner are read from around 'dst'.
 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8_t* const top = dst - BPS;
  const int top_left = top[-1];
  int y;
  for (y = 0; y < size; ++y) {
    const int delta = dst[-1] - top_left;   // left[y] - top_left
    int x;
    for (x = 0; x < size; ++x) {
      dst[x] = VP8kclip1[top[x] + delta];
    }
    dst += BPS;
  }
 }
 // Fixed-size TrueMotion entry points (4x4, 8x8 and 16x16 blocks).
 static void TM4(uint8_t* dst) {
  TrueMotion(dst, 4);
 }
 static void TM8uv(uint8_t* dst) {
  TrueMotion(dst, 8);
 }
 static void TM16(uint8_t* dst) {
  TrueMotion(dst, 16);
 }
 //------------------------------------------------------------------------------
 // 16x16
 // Vertical prediction: copies the 16 pixels of the row above the block into
 // each of the block's 16 rows.
 static void VE16(uint8_t* dst) {
  const uint8_t* const top = dst - BPS;
  int row;
  for (row = 0; row < 16; ++row) {
    memcpy(dst + row * BPS, top, 16);
  }
 }
 // Horizontal prediction: fills each of the 16 rows with the pixel found just
 // left of that row.
 static void HE16(uint8_t* dst) {
  int row;
  for (row = 0; row < 16; ++row) {
    uint8_t* const line = dst + row * BPS;
    memset(line, line[-1], 16);
  }
 }
 // Fills the 16x16 block at 'dst' with the byte value 'v'.
 static WEBP_INLINE void Put16(int v, uint8_t* dst) {
  int row = 0;
  while (row < 16) {
    memset(dst + row * BPS, v, 16);
    ++row;
  }
 }
 // DC prediction: fills the block with the rounded mean of the 16 top and
 // 16 left neighbours.
 static void DC16(uint8_t* dst) {
  const uint8_t* const top = dst - BPS;
  int sum = 16;   // rounder for the division by 32
  int k;
  for (k = 0; k < 16; ++k) {
    sum += dst[-1 + k * BPS];
    sum += top[k];
  }
  Put16(sum >> 5, dst);
 }
 // DC prediction when the top samples are not available: fills the block with
 // the rounded mean of the 16 left neighbours.
 static void DC16NoTop(uint8_t* dst) {
  int sum = 8;   // rounder for the division by 16
  int row;
  for (row = 0; row < 16; ++row) {
    sum += dst[-1 + row * BPS];
  }
  Put16(sum >> 4, dst);
 }
 // DC prediction when the left samples are not available: fills the block
 // with the rounded mean of the 16 top neighbours.
 static void DC16NoLeft(uint8_t* dst) {
  const uint8_t* const top = dst - BPS;
  int sum = 8;   // rounder for the division by 16
  int col;
  for (col = 0; col < 16; ++col) {
    sum += top[col];
  }
  Put16(sum >> 4, dst);
 }
 // DC prediction with neither top nor left samples: fills the block with the
 // constant 0x80.
 static void DC16NoTopLeft(uint8_t* dst) {
  const int default_value = 0x80;
  Put16(default_value, dst);
 }
 // Table of 16x16 luma prediction functions, NUM_B_DC_MODES entries long.
 // NOTE(review): the code that fills it is not next to this definition --
 // confirm where the entries are assigned.
 VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
 //------------------------------------------------------------------------------
 // 4x4
 // AVG3: rounded (a + 2*b + c) / 4, evaluated in uint32_t.
 // AVG2: rounded (a + b) / 2.
 // Every macro parameter is parenthesized, including the one being cast, so
 // that an expression argument is converted as a whole rather than having the
 // cast bind to its first operand only.
 #define AVG3(a, b, c) ((((uint32_t)(a)) + 2 * (b) + (c) + 2) >> 2)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 // Vertical prediction for a 4x4 block: the row above is smoothed with AVG3
 // (using top[-1] .. top[4]) and the 4 filtered values are copied into every
 // row of the block.
 static void VE4(uint8_t* dst) {
  const uint8_t* const top = dst - BPS;
  uint8_t filtered[4];
  int k;
  for (k = 0; k < 4; ++k) {
    filtered[k] = (uint8_t)AVG3(top[k - 1], top[k], top[k + 1]);
  }
  for (k = 0; k < 4; ++k) {
    memcpy(dst + k * BPS, filtered, sizeof(filtered));
  }
 }
 // Horizontal prediction for a 4x4 block: each row is filled with the AVG3
 // smoothing of the left-column pixels around it (A is the top-left corner,
 // B..E the four left neighbours; the last row repeats E).
 //
 // The rows are written with memset() rather than through a uint32_t* cast:
 // the four bytes of a row are identical, so the stored bytes are the same,
 // but no uint8_t storage is accessed through a uint32_t lvalue and no
 // alignment of 'dst' is assumed.
 static void HE4(uint8_t* dst) {    // horizontal
  const int A = dst[-1 - BPS];
  const int B = dst[-1];
  const int C = dst[-1 + BPS];
  const int D = dst[-1 + 2 * BPS];
  const int E = dst[-1 + 3 * BPS];
  memset(dst + 0 * BPS, AVG3(A, B, C), 4);
  memset(dst + 1 * BPS, AVG3(B, C, D), 4);
  memset(dst + 2 * BPS, AVG3(C, D, E), 4);
  memset(dst + 3 * BPS, AVG3(D, E, E), 4);
 }
 // DC prediction for a 4x4 block: fills it with the rounded mean of the
 // 4 top and 4 left neighbours.
 static void DC4(uint8_t* dst) {
  const uint8_t* const top = dst - BPS;
  uint32_t sum = 4;   // rounder for the division by 8
  int k;
  for (k = 0; k < 4; ++k) {
    sum += top[k] + dst[-1 + k * BPS];
  }
  sum >>= 3;
  for (k = 0; k < 4; ++k) {
    memset(dst + k * BPS, sum, 4);
  }
 }
 // Down-right prediction for a 4x4 block. The 9 border pixels are gathered
 // into one array running from the bottom-left neighbour up the left column,
 // through the top-left corner, then along the top row. Every pixel on the
 // diagonal x - y == const receives the AVG3 of three consecutive entries.
 static void RD4(uint8_t* dst) {
  int edge[9];
  uint8_t diag[7];
  int k, x, y;
  for (k = 0; k < 4; ++k) {
    edge[k] = dst[-1 + (3 - k) * BPS];   // left column, bottom to top
  }
  edge[4] = dst[-1 - BPS];               // top-left corner
  for (k = 0; k < 4; ++k) {
    edge[5 + k] = dst[k - BPS];          // top row, left to right
  }
  for (k = 0; k < 7; ++k) {
    diag[k] = AVG3(edge[k], edge[k + 1], edge[k + 2]);
  }
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      DST(x, y) = diag[x - y + 3];
    }
  }
 }
 // Down-left prediction for a 4x4 block. Uses the 8 pixels top[0..7] of the
 // row above; every pixel on the anti-diagonal x + y == n receives the AVG3
 // of top[n], top[n+1], top[n+2] (the last diagonal repeats top[7]).
 static void LD4(uint8_t* dst) {
  int top[8];
  uint8_t diag[7];
  int k, x, y;
  for (k = 0; k < 8; ++k) {
    top[k] = dst[k - BPS];
  }
  for (k = 0; k < 6; ++k) {
    diag[k] = AVG3(top[k], top[k + 1], top[k + 2]);
  }
  diag[6] = AVG3(top[6], top[7], top[7]);
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      DST(x, y) = diag[x + y];
    }
  }
 }
 // Vertical-right prediction for a 4x4 block, written out row by row.
 // Border naming: left0..left2 are the left neighbours of rows 0..2, 'corner'
 // is the top-left pixel and top0..top3 the row above.
 static void VR4(uint8_t* dst) {
  const int left0 = dst[-1 + 0 * BPS];
  const int left1 = dst[-1 + 1 * BPS];
  const int left2 = dst[-1 + 2 * BPS];
  const int corner = dst[-1 - BPS];
  const int top0 = dst[0 - BPS];
  const int top1 = dst[1 - BPS];
  const int top2 = dst[2 - BPS];
  const int top3 = dst[3 - BPS];
  // Two-tap averages along the top edge.
  const uint8_t avg_c0 = AVG2(corner, top0);
  const uint8_t avg_01 = AVG2(top0, top1);
  const uint8_t avg_12 = AVG2(top1, top2);
  const uint8_t avg_23 = AVG2(top2, top3);
  // Three-tap averages walking from the left column onto the top edge.
  const uint8_t f_l2l1l0 = AVG3(left2, left1, left0);
  const uint8_t f_l1l0c  = AVG3(left1, left0, corner);
  const uint8_t f_l0c0   = AVG3(left0, corner, top0);
  const uint8_t f_c01    = AVG3(corner, top0, top1);
  const uint8_t f_012    = AVG3(top0, top1, top2);
  const uint8_t f_123    = AVG3(top1, top2, top3);
  DST(0, 0) = avg_c0;   DST(1, 0) = avg_01; DST(2, 0) = avg_12; DST(3, 0) = avg_23;
  DST(0, 1) = f_l0c0;   DST(1, 1) = f_c01;  DST(2, 1) = f_012;  DST(3, 1) = f_123;
  DST(0, 2) = f_l1l0c;  DST(1, 2) = avg_c0; DST(2, 2) = avg_01; DST(3, 2) = avg_12;
  DST(0, 3) = f_l2l1l0; DST(1, 3) = f_l0c0; DST(2, 3) = f_c01;  DST(3, 3) = f_012;
 }
 // 4x4 luma predictor, "vertical-left" mode.
 // Uses only the eight samples A..H of the row above the block
 // (dst[0 - BPS] .. dst[7 - BPS]). The first group of assignments uses 2-tap
 // averages (AVG2), the second group 3-tap averages (AVG3).
 // NOTE(review): DST/AVG2/AVG3 are macros defined earlier in the file (not
 // visible here); DST(x, y) presumably addresses column x, row y -- confirm.
 static void VL4(uint8_t* dst) {   // Vertical-Left
  const int A = dst[0 - BPS];
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
  const int D = dst[3 - BPS];
  const int E = dst[4 - BPS];
  const int F = dst[5 - BPS];
  const int G = dst[6 - BPS];
  const int H = dst[7 - BPS];
  DST(0, 0) =             AVG2(A, B);
  DST(1, 0) = DST(0, 2) = AVG2(B, C);
  DST(2, 0) = DST(1, 2) = AVG2(C, D);
  DST(3, 0) = DST(2, 2) = AVG2(D, E);
  DST(0, 1) =             AVG3(A, B, C);
  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
              DST(3, 2) = AVG3(E, F, G);
              DST(3, 3) = AVG3(F, G, H);
 }
 // 4x4 luma predictor, "horizontal-up" mode.
 // Uses only the four left neighbours I..L (dst[-1 + row * BPS], rows 0..3).
 // Positions that would need samples past L are filled with L itself (the
 // final assignment chain), and the last 3-tap average repeats L.
 // NOTE(review): DST/AVG2/AVG3 are macros defined earlier in the file (not
 // visible here); DST(x, y) presumably addresses column x, row y -- confirm.
 static void HU4(uint8_t* dst) {   // Horizontal-Up
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
  const int L = dst[-1 + 3 * BPS];
  DST(0, 0) =             AVG2(I, J);
  DST(2, 0) = DST(0, 1) = AVG2(J, K);
  DST(2, 1) = DST(0, 2) = AVG2(K, L);
  DST(1, 0) =             AVG3(I, J, K);
  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
  DST(3, 2) = DST(2, 2) =
    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
 }
 // 4x4 luma predictor, "horizontal-down" mode.
 // Reads the left neighbours I..L (dst[-1 + row * BPS], rows 0..3), the
 // top-left corner X (dst[-1 - BPS]) and the top neighbours A..C
 // (dst[col - BPS], cols 0..2). The first group of assignments uses 2-tap
 // averages (AVG2), the second group 3-tap averages (AVG3).
 // NOTE(review): DST/AVG2/AVG3 are macros defined earlier in the file (not
 // visible here); DST(x, y) presumably addresses column x, row y -- confirm.
 static void HD4(uint8_t* dst) {  // Horizontal-Down
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
  const int L = dst[-1 + 3 * BPS];
  const int X = dst[-1 - BPS];
  const int A = dst[0 - BPS];
  const int B = dst[1 - BPS];
  const int C = dst[2 - BPS];
  DST(0, 0) = DST(2, 1) = AVG2(I, X);
  DST(0, 1) = DST(2, 2) = AVG2(J, I);
  DST(0, 2) = DST(2, 3) = AVG2(K, J);
  DST(0, 3)             = AVG2(L, K);
  DST(3, 0)             = AVG3(A, B, C);
  DST(2, 0)             = AVG3(X, A, B);
  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
  DST(1, 3)             = AVG3(L, K, J);
 }
 #undef DST
 #undef AVG3
 #undef AVG2
 // Dispatch table of the 4x4 luma predictors. Entries 0..9 are assigned in
 // VP8DspInit() (DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4).
 VP8PredFunc VP8PredLuma4[NUM_BMODES];
 //------------------------------------------------------------------------------
 // Chroma
 // Vertical chroma prediction: every one of the 8 block rows becomes a copy
 // of the 8 bytes found in the row just above the block (dst - BPS).
 static void VE8uv(uint8_t* dst) {    // vertical
  const uint8_t* const above = dst - BPS;
  int offset;
  for (offset = 0; offset < 8 * BPS; offset += BPS) {
    memcpy(dst + offset, above, 8);
  }
 }
 // Horizontal chroma prediction: each of the 8 block rows is filled with the
 // byte sitting immediately to its left (column -1 of that row).
 static void HE8uv(uint8_t* dst) {    // horizontal
  int row;
  for (row = 0; row < 8; ++row) {
    uint8_t* const line = dst + row * BPS;
    memset(line, line[-1], 8);
  }
 }
 // Helper for the chroma-DC predictions: paints the 8x8 block at 'dst'
 // (row pitch BPS) with the single byte 'value'.
 static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
  int offset;
  for (offset = 0; offset < 8 * BPS; offset += BPS) {
    memset(dst + offset, value, 8);
  }
 }
 // DC chroma prediction using both borders: the block is filled with the
 // rounded mean of the 8 samples above and the 8 samples to the left
 // (16 samples, hence the +8 bias and the shift by 4).
 static void DC8uv(uint8_t* dst) {     // DC
  const uint8_t* const top = dst - BPS;
  const uint8_t* const left = dst - 1;
  int sum = 8;
  int k;
  for (k = 0; k < 8; ++k) {
    sum += top[k];
    sum += left[k * BPS];
  }
  Put8x8uv(sum >> 4, dst);
 }
 // DC chroma prediction when the left border is unavailable: the block is
 // filled with the rounded mean of the 8 samples of the row above
 // (+4 bias, shift by 3).
 static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
  const uint8_t* const top = dst - BPS;
  int sum = 4;
  int col;
  for (col = 0; col < 8; ++col) sum += top[col];
  Put8x8uv(sum >> 3, dst);
 }
 // DC chroma prediction when the top border is unavailable: the block is
 // filled with the rounded mean of the 8 samples of the column to the left
 // (+4 bias, shift by 3).
 static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
  const uint8_t* const left = dst - 1;
  int sum = 4;
  int row;
  for (row = 0; row < 8; ++row) sum += left[row * BPS];
  Put8x8uv(sum >> 3, dst);
 }
 // DC chroma prediction when neither border is available: the block is
 // filled with the fixed value 0x80.
 static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
  const uint8_t fill = 0x80;
  Put8x8uv(fill, dst);
 }
 // Dispatch table of the 8x8 chroma predictors. Entries 0..6 are assigned in
 // VP8DspInit() (DC8uv, TM8uv, VE8uv, HE8uv, DC8uvNoTop, DC8uvNoLeft,
 // DC8uvNoTopLeft).
 VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
 //------------------------------------------------------------------------------
 // Edge filtering functions
 // 4 pixels in, 2 pixels out.
 // Reads p[-2*step], p[-step], p[0] and p[step] and rewrites only the two
 // middle samples, p[-step] and p[0]. All lookups go through the
 // VP8ksclip1 / VP8ksclip2 / VP8kclip1 tables.
 static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int delta = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];  // in [-893,892]
  const int sub_q0 = VP8ksclip2[(delta + 4) >> 3];        // in [-16,15]
  const int add_p0 = VP8ksclip2[(delta + 3) >> 3];
  p[-step] = VP8kclip1[p0 + add_p0];
  p[0] = VP8kclip1[q0 - sub_q0];
 }
 // 4 pixels in, 4 pixels out.
 // Reads p[-2*step] .. p[step] and rewrites all four of them. The correction
 // is derived from the q0 - p0 difference only; the outer samples receive
 // half of the (rounded) inner correction.
 static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int delta = 3 * (q0 - p0);
  const int sub_q0 = VP8ksclip2[(delta + 4) >> 3];
  const int add_p0 = VP8ksclip2[(delta + 3) >> 3];
  const int outer = (sub_q0 + 1) >> 1;
  p[-2 * step] = VP8kclip1[p1 + outer];
  p[-step] = VP8kclip1[p0 + add_p0];
  p[0] = VP8kclip1[q0 - sub_q0];
  p[step] = VP8kclip1[q1 - outer];
 }
 // 6 pixels in, 6 pixels out.
 // Reads p[-3*step] .. p[2*step] and rewrites all six samples, applying a
 // correction that shrinks (27/128, 18/128, 9/128 of 'w') with the distance
 // from the edge between p[-step] and p[0].
 static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
  const int p2 = p[-3 * step];
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int q2 = p[2 * step];
  const int w = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
  // w is in [-128,127], near in [-27,27], mid in [-18,18], far in [-9,9]
  const int near = (27 * w + 63) >> 7;  // eq. to ((3 * w + 7) * 9) >> 7
  const int mid = (18 * w + 63) >> 7;   // eq. to ((2 * w + 7) * 9) >> 7
  const int far = (9 * w + 63) >> 7;    // eq. to ((1 * w + 7) * 9) >> 7
  p[-3 * step] = VP8kclip1[p2 + far];
  p[-2 * step] = VP8kclip1[p1 + mid];
  p[-step] = VP8kclip1[p0 + near];
  p[0] = VP8kclip1[q0 - near];
  p[step] = VP8kclip1[q1 - mid];
  p[2 * step] = VP8kclip1[q2 - far];
 }
 // Returns non-zero when either side of the edge shows a step larger than
 // 'thresh' between its two nearest samples (|p1 - p0| or |q1 - q0|, looked
 // up through VP8kabs0).
 static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  if (VP8kabs0[p1 - p0] > thresh) return 1;
  return VP8kabs0[q1 - q0] > thresh;
 }
 // Returns non-zero when 4 * |p0 - q0| + |p1 - q1| does not exceed 't'
 // (absolute values come from the VP8kabs0 table).
 static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int inner_diff = VP8kabs0[p0 - q0];
  const int outer_diff = VP8kabs0[p1 - q1];
  return (4 * inner_diff + outer_diff) <= t;
 }
 // Stricter variant of needs_filter(): besides the 4 * |p0 - q0| + |p1 - q1|
 // test against 't', every difference between adjacent samples on each side
 // of the edge (p3..p0 and q0..q3) must not exceed 'it'.
 // Returns 1 when all conditions hold, 0 otherwise.
 static WEBP_INLINE int needs_filter2(const uint8_t* p,
                                      int step, int t, int it) {
  const int p3 = p[-4 * step];
  const int p2 = p[-3 * step];
  const int p1 = p[-2 * step];
  const int p0 = p[-step];
  const int q0 = p[0];
  const int q1 = p[step];
  const int q2 = p[2 * step];
  const int q3 = p[3 * step];
  if ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) > t) return 0;
  if (VP8kabs0[p3 - p2] > it) return 0;
  if (VP8kabs0[p2 - p1] > it) return 0;
  if (VP8kabs0[p1 - p0] > it) return 0;
  if (VP8kabs0[q3 - q2] > it) return 0;
  if (VP8kabs0[q2 - q1] > it) return 0;
  return VP8kabs0[q1 - q0] <= it;
 }
 //------------------------------------------------------------------------------
 // Simple In-loop filtering (Paragraph 15.2)
 // Simple filter across a 16-pixel wide edge: for each of the 16 consecutive
 // positions starting at 'p', samples are taken 'stride' apart and
 // do_filter2() is applied wherever needs_filter() accepts the position.
 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  const int limit = 2 * thresh + 1;
  int col = 0;
  while (col < 16) {
    uint8_t* const px = p + col;
    if (needs_filter(px, stride, limit)) do_filter2(px, stride);
    ++col;
  }
 }
 // Simple filter across a 16-pixel tall edge: for each of the 16 rows
 // starting at 'p' (rows are 'stride' bytes apart), adjacent samples of the
 // row are examined and do_filter2() is applied wherever needs_filter()
 // accepts the position.
 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  const int limit = 2 * thresh + 1;
  int row = 0;
  while (row < 16) {
    uint8_t* const px = p + row * stride;
    if (needs_filter(px, 1, limit)) do_filter2(px, 1);
    ++row;
  }
 }
 // Applies SimpleVFilter16() to the three edges located 4, 8 and 12 rows
 // below 'p'.
 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleVFilter16(p + edge * 4 * stride, stride, thresh);
  }
 }
 // Applies SimpleHFilter16() to the three edges located 4, 8 and 12 columns
 // to the right of 'p'.
 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleHFilter16(p + edge * 4, stride, thresh);
  }
 }
 //------------------------------------------------------------------------------
 // Complex In-loop filtering (Paragraph 15.3)
 // Walks 'size' positions along an edge, advancing by 'vstride' each time.
 // At every position accepted by needs_filter2() (samples taken 'hstride'
 // apart), do_filter2() is used when hev() reports a large step, and
 // do_filter6() otherwise.
 static WEBP_INLINE void FilterLoop26(uint8_t* p,
                                      int hstride, int vstride, int size,
                                      int thresh, int ithresh, int hev_thresh) {
  const int edge_limit = 2 * thresh + 1;
  int remaining;
  for (remaining = size; remaining > 0; --remaining, p += vstride) {
    if (!needs_filter2(p, hstride, edge_limit, ithresh)) continue;
    if (hev(p, hstride, hev_thresh)) {
      do_filter2(p, hstride);
    } else {
      do_filter6(p, hstride);
    }
  }
 }
 // Walks 'size' positions along an edge, advancing by 'vstride' each time.
 // At every position accepted by needs_filter2() (samples taken 'hstride'
 // apart), do_filter2() is used when hev() reports a large step, and
 // do_filter4() otherwise.
 static WEBP_INLINE void FilterLoop24(uint8_t* p,
                                      int hstride, int vstride, int size,
                                      int thresh, int ithresh, int hev_thresh) {
  const int edge_limit = 2 * thresh + 1;
  int remaining;
  for (remaining = size; remaining > 0; --remaining, p += vstride) {
    if (!needs_filter2(p, hstride, edge_limit, ithresh)) continue;
    if (hev(p, hstride, hev_thresh)) {
      do_filter2(p, hstride);
    } else {
      do_filter4(p, hstride);
    }
  }
 }
 // on macroblock edges
 // Runs FilterLoop26() over 16 consecutive positions starting at 'p', with
 // filter taps spaced 'stride' bytes apart.
 static void VFilter16(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  const int tap_step = stride;
  const int walk_step = 1;
  const int count = 16;
  FilterLoop26(p, tap_step, walk_step, count, thresh, ithresh, hev_thresh);
 }
 // Runs FilterLoop26() over 16 positions spaced 'stride' bytes apart starting
 // at 'p', with filter taps on adjacent bytes.
 static void HFilter16(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  const int tap_step = 1;
  const int walk_step = stride;
  const int count = 16;
  FilterLoop26(p, tap_step, walk_step, count, thresh, ithresh, hev_thresh);
 }
 // on three inner edges
 // Runs FilterLoop24() (16 positions, taps 'stride' apart) on the three edges
 // located 4, 8 and 12 rows below 'p'.
 static void VFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    uint8_t* const start = p + edge * 4 * stride;
    FilterLoop24(start, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
 }
 // Runs FilterLoop24() (16 positions 'stride' apart, taps on adjacent bytes)
 // on the three edges located 4, 8 and 12 columns to the right of 'p'.
 static void HFilter16i(uint8_t* p, int stride,
                        int thresh, int ithresh, int hev_thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    uint8_t* const start = p + edge * 4;
    FilterLoop24(start, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
 }
 // 8-pixels wide variant, for chroma filtering
 // Runs FilterLoop26() over 8 consecutive positions (taps 'stride' apart),
 // first on the 'u' plane and then on the 'v' plane.
 static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8_t* const planes[2] = { u, v };
  int c;
  for (c = 0; c < 2; ++c) {
    FilterLoop26(planes[c], stride, 1, 8, thresh, ithresh, hev_thresh);
  }
 }
 // Runs FilterLoop26() over 8 positions spaced 'stride' apart (taps on
 // adjacent bytes), first on the 'u' plane and then on the 'v' plane.
 static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8_t* const planes[2] = { u, v };
  int c;
  for (c = 0; c < 2; ++c) {
    FilterLoop26(planes[c], 1, stride, 8, thresh, ithresh, hev_thresh);
  }
 }
 // Runs FilterLoop24() over 8 consecutive positions (taps 'stride' apart) on
 // the edge located 4 rows below the start of each plane, 'u' first, then 'v'.
 static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  const int inner_edge = 4 * stride;
  uint8_t* const u_edge = u + inner_edge;
  uint8_t* const v_edge = v + inner_edge;
  FilterLoop24(u_edge, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v_edge, stride, 1, 8, thresh, ithresh, hev_thresh);
 }
 // Runs FilterLoop24() over 8 positions spaced 'stride' apart (taps on
 // adjacent bytes) on the edge located 4 columns to the right of the start of
 // each plane, 'u' first, then 'v'.
 static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  const int inner_edge = 4;
  uint8_t* const u_edge = u + inner_edge;
  uint8_t* const v_edge = v + inner_edge;
  FilterLoop24(u_edge, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v_edge, 1, stride, 8, thresh, ithresh, hev_thresh);
 }
 //------------------------------------------------------------------------------
 // Transform dispatch pointers. VP8DspInit() assigns Transform, TransformUV,
 // TransformDC, TransformDCUV and TransformAC3 to them.
 VP8DecIdct2 VP8Transform;
 VP8DecIdct VP8TransformAC3;
 VP8DecIdct VP8TransformUV;
 VP8DecIdct VP8TransformDC;
 VP8DecIdct VP8TransformDCUV;
 // Complex loop-filter dispatch pointers (luma 16-wide and chroma 8-wide,
 // macroblock-edge and inner-edge 'i' variants); assigned in VP8DspInit().
 VP8LumaFilterFunc VP8VFilter16;
 VP8LumaFilterFunc VP8HFilter16;
 VP8ChromaFilterFunc VP8VFilter8;
 VP8ChromaFilterFunc VP8HFilter8;
 VP8LumaFilterFunc VP8VFilter16i;
 VP8LumaFilterFunc VP8HFilter16i;
 VP8ChromaFilterFunc VP8VFilter8i;
 VP8ChromaFilterFunc VP8HFilter8i;
 // Simple loop-filter dispatch pointers; assigned in VP8DspInit().
 VP8SimpleFilterFunc VP8SimpleVFilter16;
 VP8SimpleFilterFunc VP8SimpleHFilter16;
 VP8SimpleFilterFunc VP8SimpleVFilter16i;
 VP8SimpleFilterFunc VP8SimpleHFilter16i;
 // Platform-specific initializers, defined in other translation units.
 // VP8DspInit() only calls each of them inside the matching WEBP_USE_* guard.
 extern void VP8DspInitSSE2(void);
 extern void VP8DspInitSSE41(void);
 extern void VP8DspInitNEON(void);
 extern void VP8DspInitMIPS32(void);
 extern void VP8DspInitMIPSdspR2(void);
 // Remembers the VP8GetCPUInfo value seen by the last completed VP8DspInit().
 // It starts out as its own address (cast to VP8CPUInfo), a value that the
 // first comparison in VP8DspInit() is not expected to match, so the first
 // call always performs the initialization.
 static volatile VP8CPUInfo dec_last_cpuinfo_used =
    (VP8CPUInfo)&dec_last_cpuinfo_used;
 // Installs the decoder DSP function pointers.
 // Steps: (1) return early if the last completed call already saw the current
 // VP8GetCPUInfo value, (2) initialize the clip tables, (3) point every
 // dispatch pointer / predictor table entry at the plain-C implementation,
 // (4) when VP8GetCPUInfo is set, let the compiled-in SIMD initializers
 // overwrite some of them, (5) record VP8GetCPUInfo for the next call.
 // NOTE(review): the guard variable is a plain 'volatile', not an atomic, and
 // the function is tagged WEBP_TSAN_IGNORE_FUNCTION; concurrent first calls
 // presumably rely on every caller writing the same values -- confirm.
 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
  // Nothing to do if the CPU-info hook is unchanged since the last init.
  if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
  VP8InitClipTables();
  // Default (plain-C) transforms.
  VP8TransformWHT = TransformWHT;
  VP8Transform = TransformTwo;
  VP8TransformUV = TransformUV;
  VP8TransformDC = TransformDC;
  VP8TransformDCUV = TransformDCUV;
  VP8TransformAC3 = TransformAC3;
  // Default complex loop filters.
  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
  VP8VFilter8 = VFilter8;
  VP8HFilter8 = HFilter8;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16i = HFilter16i;
  VP8VFilter8i = VFilter8i;
  VP8HFilter8i = HFilter8i;
  // Default simple loop filters.
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;
  // 4x4 luma predictors.
  VP8PredLuma4[0] = DC4;
  VP8PredLuma4[1] = TM4;
  VP8PredLuma4[2] = VE4;
  VP8PredLuma4[3] = HE4;
  VP8PredLuma4[4] = RD4;
  VP8PredLuma4[5] = VR4;
  VP8PredLuma4[6] = LD4;
  VP8PredLuma4[7] = VL4;
  VP8PredLuma4[8] = HD4;
  VP8PredLuma4[9] = HU4;
  // 16x16 luma predictors; slots 4..6 hold the DC variants for missing
  // top and/or left samples.
  VP8PredLuma16[0] = DC16;
  VP8PredLuma16[1] = TM16;
  VP8PredLuma16[2] = VE16;
  VP8PredLuma16[3] = HE16;
  VP8PredLuma16[4] = DC16NoTop;
  VP8PredLuma16[5] = DC16NoLeft;
  VP8PredLuma16[6] = DC16NoTopLeft;
  // 8x8 chroma predictors, same slot layout as the 16x16 luma table.
  VP8PredChroma8[0] = DC8uv;
  VP8PredChroma8[1] = TM8uv;
  VP8PredChroma8[2] = VE8uv;
  VP8PredChroma8[3] = HE8uv;
  VP8PredChroma8[4] = DC8uvNoTop;
  VP8PredChroma8[5] = DC8uvNoLeft;
  VP8PredChroma8[6] = DC8uvNoTopLeft;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8DspInitSSE2();
 #if defined(WEBP_USE_SSE41)
      // SSE4.1 is only probed once SSE2 has been confirmed.
      if (VP8GetCPUInfo(kSSE4_1)) {
        VP8DspInitSSE41();
      }
 #endif
    }
 #endif
 #if defined(WEBP_USE_NEON)
    if (VP8GetCPUInfo(kNEON)) {
      VP8DspInitNEON();
    }
 #endif
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8DspInitMIPS32();
    }
 #endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8DspInitMIPSdspR2();
    }
 #endif
  }
  // Mark this CPU-info hook as handled so later calls return immediately.
  dec_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/loaders/webp/dsp/dec_clip_tables.cpp
+++ b/src/loaders/webp/dsp/dec_clip_tables.cpp
@ -0,0 +1,366 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Clipping tables for filtering
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./dsp.h"
 //#define USE_STATIC_TABLES     // uncomment to use the static tables below; leave undefined to have run-time table initialization
 #ifdef USE_STATIC_TABLES
 // Absolute-value lookup table: abs0[255 + i] == |i| for every i in [-255, 255]
 // (511 entries: 0xff counting down to 0x00 at index 255, then back up to 0xff).
 // Only compiled when USE_STATIC_TABLES is defined.
 // NOTE(review): presumably consumed through a pointer biased to &abs0[255] so
 // callers can index it with a signed difference directly -- confirm against
 // the accessor definitions later in this file.
 static const uint8_t abs0[255 + 255 + 1] = {
  (uint8_t)0xff, (uint8_t)0xfe, (uint8_t)0xfd, (uint8_t)0xfc, (uint8_t)0xfb, (uint8_t)0xfa, (uint8_t)0xf9, (uint8_t)0xf8, (uint8_t)0xf7, (uint8_t)0xf6, (uint8_t)0xf5, (uint8_t)0xf4,
  (uint8_t)0xf3, (uint8_t)0xf2, (uint8_t)0xf1, (uint8_t)0xf0, (uint8_t)0xef, (uint8_t)0xee, (uint8_t)0xed, (uint8_t)0xec, (uint8_t)0xeb, (uint8_t)0xea, (uint8_t)0xe9, (uint8_t)0xe8,
  (uint8_t)0xe7, (uint8_t)0xe6, (uint8_t)0xe5, (uint8_t)0xe4, (uint8_t)0xe3, (uint8_t)0xe2, (uint8_t)0xe1, (uint8_t)0xe0, (uint8_t)0xdf, (uint8_t)0xde, (uint8_t)0xdd, (uint8_t)0xdc,
  (uint8_t)0xdb, (uint8_t)0xda, (uint8_t)0xd9, (uint8_t)0xd8, (uint8_t)0xd7, (uint8_t)0xd6, (uint8_t)0xd5, (uint8_t)0xd4, (uint8_t)0xd3, (uint8_t)0xd2, (uint8_t)0xd1, (uint8_t)0xd0,
  (uint8_t)0xcf, (uint8_t)0xce, (uint8_t)0xcd, (uint8_t)0xcc, (uint8_t)0xcb, (uint8_t)0xca, (uint8_t)0xc9, (uint8_t)0xc8, (uint8_t)0xc7, (uint8_t)0xc6, (uint8_t)0xc5, (uint8_t)0xc4,
  (uint8_t)0xc3, (uint8_t)0xc2, (uint8_t)0xc1, (uint8_t)0xc0, (uint8_t)0xbf, (uint8_t)0xbe, (uint8_t)0xbd, (uint8_t)0xbc, (uint8_t)0xbb, (uint8_t)0xba, (uint8_t)0xb9, (uint8_t)0xb8,
  (uint8_t)0xb7, (uint8_t)0xb6, (uint8_t)0xb5, (uint8_t)0xb4, (uint8_t)0xb3, (uint8_t)0xb2, (uint8_t)0xb1, (uint8_t)0xb0, (uint8_t)0xaf, (uint8_t)0xae, (uint8_t)0xad, (uint8_t)0xac,
  (uint8_t)0xab, (uint8_t)0xaa, (uint8_t)0xa9, (uint8_t)0xa8, (uint8_t)0xa7, (uint8_t)0xa6, (uint8_t)0xa5, (uint8_t)0xa4, (uint8_t)0xa3, (uint8_t)0xa2, (uint8_t)0xa1, (uint8_t)0xa0,
  (uint8_t)0x9f, (uint8_t)0x9e, (uint8_t)0x9d, (uint8_t)0x9c, (uint8_t)0x9b, (uint8_t)0x9a, (uint8_t)0x99, (uint8_t)0x98, (uint8_t)0x97, (uint8_t)0x96, (uint8_t)0x95, (uint8_t)0x94,
  (uint8_t)0x93, (uint8_t)0x92, (uint8_t)0x91, (uint8_t)0x90, (uint8_t)0x8f, (uint8_t)0x8e, (uint8_t)0x8d, (uint8_t)0x8c, (uint8_t)0x8b, (uint8_t)0x8a, (uint8_t)0x89, (uint8_t)0x88,
  (uint8_t)0x87, (uint8_t)0x86, (uint8_t)0x85, (uint8_t)0x84, (uint8_t)0x83, (uint8_t)0x82, (uint8_t)0x81, (uint8_t)0x80, (uint8_t)0x7f, (uint8_t)0x7e, (uint8_t)0x7d, (uint8_t)0x7c,
  (uint8_t)0x7b, (uint8_t)0x7a, (uint8_t)0x79, (uint8_t)0x78, (uint8_t)0x77, (uint8_t)0x76, (uint8_t)0x75, (uint8_t)0x74, (uint8_t)0x73, (uint8_t)0x72, (uint8_t)0x71, (uint8_t)0x70,
  (uint8_t)0x6f, (uint8_t)0x6e, (uint8_t)0x6d, (uint8_t)0x6c, (uint8_t)0x6b, (uint8_t)0x6a, (uint8_t)0x69, (uint8_t)0x68, (uint8_t)0x67, (uint8_t)0x66, (uint8_t)0x65, (uint8_t)0x64,
  (uint8_t)0x63, (uint8_t)0x62, (uint8_t)0x61, (uint8_t)0x60, (uint8_t)0x5f, (uint8_t)0x5e, (uint8_t)0x5d, (uint8_t)0x5c, (uint8_t)0x5b, (uint8_t)0x5a, (uint8_t)0x59, (uint8_t)0x58,
  (uint8_t)0x57, (uint8_t)0x56, (uint8_t)0x55, (uint8_t)0x54, (uint8_t)0x53, (uint8_t)0x52, (uint8_t)0x51, (uint8_t)0x50, (uint8_t)0x4f, (uint8_t)0x4e, (uint8_t)0x4d, (uint8_t)0x4c,
  (uint8_t)0x4b, (uint8_t)0x4a, (uint8_t)0x49, (uint8_t)0x48, (uint8_t)0x47, (uint8_t)0x46, (uint8_t)0x45, (uint8_t)0x44, (uint8_t)0x43, (uint8_t)0x42, (uint8_t)0x41, (uint8_t)0x40,
  (uint8_t)0x3f, (uint8_t)0x3e, (uint8_t)0x3d, (uint8_t)0x3c, (uint8_t)0x3b, (uint8_t)0x3a, (uint8_t)0x39, (uint8_t)0x38, (uint8_t)0x37, (uint8_t)0x36, (uint8_t)0x35, (uint8_t)0x34,
  (uint8_t)0x33, (uint8_t)0x32, (uint8_t)0x31, (uint8_t)0x30, (uint8_t)0x2f, (uint8_t)0x2e, (uint8_t)0x2d, (uint8_t)0x2c, (uint8_t)0x2b, (uint8_t)0x2a, (uint8_t)0x29, (uint8_t)0x28,
  (uint8_t)0x27, (uint8_t)0x26, (uint8_t)0x25, (uint8_t)0x24, (uint8_t)0x23, (uint8_t)0x22, (uint8_t)0x21, (uint8_t)0x20, (uint8_t)0x1f, (uint8_t)0x1e, (uint8_t)0x1d, (uint8_t)0x1c,
  (uint8_t)0x1b, (uint8_t)0x1a, (uint8_t)0x19, (uint8_t)0x18, (uint8_t)0x17, (uint8_t)0x16, (uint8_t)0x15, (uint8_t)0x14, (uint8_t)0x13, (uint8_t)0x12, (uint8_t)0x11, (uint8_t)0x10,
  (uint8_t)0x0f, (uint8_t)0x0e, (uint8_t)0x0d, (uint8_t)0x0c, (uint8_t)0x0b, (uint8_t)0x0a, (uint8_t)0x09, (uint8_t)0x08, (uint8_t)0x07, (uint8_t)0x06, (uint8_t)0x05, (uint8_t)0x04,
  (uint8_t)0x03, (uint8_t)0x02, (uint8_t)0x01, (uint8_t)0x00, (uint8_t)0x01, (uint8_t)0x02, (uint8_t)0x03, (uint8_t)0x04, (uint8_t)0x05, (uint8_t)0x06, (uint8_t)0x07, (uint8_t)0x08,
  (uint8_t)0x09, (uint8_t)0x0a, (uint8_t)0x0b, (uint8_t)0x0c, (uint8_t)0x0d, (uint8_t)0x0e, (uint8_t)0x0f, (uint8_t)0x10, (uint8_t)0x11, (uint8_t)0x12, (uint8_t)0x13, (uint8_t)0x14,
  (uint8_t)0x15, (uint8_t)0x16, (uint8_t)0x17, (uint8_t)0x18, (uint8_t)0x19, (uint8_t)0x1a, (uint8_t)0x1b, (uint8_t)0x1c, (uint8_t)0x1d, (uint8_t)0x1e, (uint8_t)0x1f, (uint8_t)0x20,
  (uint8_t)0x21, (uint8_t)0x22, (uint8_t)0x23, (uint8_t)0x24, (uint8_t)0x25, (uint8_t)0x26, (uint8_t)0x27, (uint8_t)0x28, (uint8_t)0x29, (uint8_t)0x2a, (uint8_t)0x2b, (uint8_t)0x2c,
  (uint8_t)0x2d, (uint8_t)0x2e, (uint8_t)0x2f, (uint8_t)0x30, (uint8_t)0x31, (uint8_t)0x32, (uint8_t)0x33, (uint8_t)0x34, (uint8_t)0x35, (uint8_t)0x36, (uint8_t)0x37, (uint8_t)0x38,
  (uint8_t)0x39, (uint8_t)0x3a, (uint8_t)0x3b, (uint8_t)0x3c, (uint8_t)0x3d, (uint8_t)0x3e, (uint8_t)0x3f, (uint8_t)0x40, (uint8_t)0x41, (uint8_t)0x42, (uint8_t)0x43, (uint8_t)0x44,
  (uint8_t)0x45, (uint8_t)0x46, (uint8_t)0x47, (uint8_t)0x48, (uint8_t)0x49, (uint8_t)0x4a, (uint8_t)0x4b, (uint8_t)0x4c, (uint8_t)0x4d, (uint8_t)0x4e, (uint8_t)0x4f, (uint8_t)0x50,
  (uint8_t)0x51, (uint8_t)0x52, (uint8_t)0x53, (uint8_t)0x54, (uint8_t)0x55, (uint8_t)0x56, (uint8_t)0x57, (uint8_t)0x58, (uint8_t)0x59, (uint8_t)0x5a, (uint8_t)0x5b, (uint8_t)0x5c,
  (uint8_t)0x5d, (uint8_t)0x5e, (uint8_t)0x5f, (uint8_t)0x60, (uint8_t)0x61, (uint8_t)0x62, (uint8_t)0x63, (uint8_t)0x64, (uint8_t)0x65, (uint8_t)0x66, (uint8_t)0x67, (uint8_t)0x68,
  (uint8_t)0x69, (uint8_t)0x6a, (uint8_t)0x6b, (uint8_t)0x6c, (uint8_t)0x6d, (uint8_t)0x6e, (uint8_t)0x6f, (uint8_t)0x70, (uint8_t)0x71, (uint8_t)0x72, (uint8_t)0x73, (uint8_t)0x74,
  (uint8_t)0x75, (uint8_t)0x76, (uint8_t)0x77, (uint8_t)0x78, (uint8_t)0x79, (uint8_t)0x7a, (uint8_t)0x7b, (uint8_t)0x7c, (uint8_t)0x7d, (uint8_t)0x7e, (uint8_t)0x7f, (uint8_t)0x80,
  (uint8_t)0x81, (uint8_t)0x82, (uint8_t)0x83, (uint8_t)0x84, (uint8_t)0x85, (uint8_t)0x86, (uint8_t)0x87, (uint8_t)0x88, (uint8_t)0x89, (uint8_t)0x8a, (uint8_t)0x8b, (uint8_t)0x8c,
  (uint8_t)0x8d, (uint8_t)0x8e, (uint8_t)0x8f, (uint8_t)0x90, (uint8_t)0x91, (uint8_t)0x92, (uint8_t)0x93, (uint8_t)0x94, (uint8_t)0x95, (uint8_t)0x96, (uint8_t)0x97, (uint8_t)0x98,
  (uint8_t)0x99, (uint8_t)0x9a, (uint8_t)0x9b, (uint8_t)0x9c, (uint8_t)0x9d, (uint8_t)0x9e, (uint8_t)0x9f, (uint8_t)0xa0, (uint8_t)0xa1, (uint8_t)0xa2, (uint8_t)0xa3, (uint8_t)0xa4,
  (uint8_t)0xa5, (uint8_t)0xa6, (uint8_t)0xa7, (uint8_t)0xa8, (uint8_t)0xa9, (uint8_t)0xaa, (uint8_t)0xab, (uint8_t)0xac, (uint8_t)0xad, (uint8_t)0xae, (uint8_t)0xaf, (uint8_t)0xb0,
  (uint8_t)0xb1, (uint8_t)0xb2, (uint8_t)0xb3, (uint8_t)0xb4, (uint8_t)0xb5, (uint8_t)0xb6, (uint8_t)0xb7, (uint8_t)0xb8, (uint8_t)0xb9, (uint8_t)0xba, (uint8_t)0xbb, (uint8_t)0xbc,
  (uint8_t)0xbd, (uint8_t)0xbe, (uint8_t)0xbf, (uint8_t)0xc0, (uint8_t)0xc1, (uint8_t)0xc2, (uint8_t)0xc3, (uint8_t)0xc4, (uint8_t)0xc5, (uint8_t)0xc6, (uint8_t)0xc7, (uint8_t)0xc8,
  (uint8_t)0xc9, (uint8_t)0xca, (uint8_t)0xcb, (uint8_t)0xcc, (uint8_t)0xcd, (uint8_t)0xce, (uint8_t)0xcf, (uint8_t)0xd0, (uint8_t)0xd1, (uint8_t)0xd2, (uint8_t)0xd3, (uint8_t)0xd4,
  (uint8_t)0xd5, (uint8_t)0xd6, (uint8_t)0xd7, (uint8_t)0xd8, (uint8_t)0xd9, (uint8_t)0xda, (uint8_t)0xdb, (uint8_t)0xdc, (uint8_t)0xdd, (uint8_t)0xde, (uint8_t)0xdf, (uint8_t)0xe0,
  (uint8_t)0xe1, (uint8_t)0xe2, (uint8_t)0xe3, (uint8_t)0xe4, (uint8_t)0xe5, (uint8_t)0xe6, (uint8_t)0xe7, (uint8_t)0xe8, (uint8_t)0xe9, (uint8_t)0xea, (uint8_t)0xeb, (uint8_t)0xec,
  (uint8_t)0xed, (uint8_t)0xee, (uint8_t)0xef, (uint8_t)0xf0, (uint8_t)0xf1, (uint8_t)0xf2, (uint8_t)0xf3, (uint8_t)0xf4, (uint8_t)0xf5, (uint8_t)0xf6, (uint8_t)0xf7, (uint8_t)0xf8,
  (uint8_t)0xf9, (uint8_t)0xfa, (uint8_t)0xfb, (uint8_t)0xfc, (uint8_t)0xfd, (uint8_t)0xfe, (uint8_t)0xff
 };
 static const int8_t sclip1[1020 + 1020 + 1] = {
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x81, (int8_t)0x82, (int8_t)0x83, (int8_t)0x84, (int8_t)0x85, (int8_t)0x86, (int8_t)0x87,
  (int8_t)0x88, (int8_t)0x89, (int8_t)0x8a, (int8_t)0x8b, (int8_t)0x8c, (int8_t)0x8d, (int8_t)0x8e, (int8_t)0x8f, (int8_t)0x90, (int8_t)0x91, (int8_t)0x92, (int8_t)0x93,
  (int8_t)0x94, (int8_t)0x95, (int8_t)0x96, (int8_t)0x97, (int8_t)0x98, (int8_t)0x99, (int8_t)0x9a, (int8_t)0x9b, (int8_t)0x9c, (int8_t)0x9d, (int8_t)0x9e, (int8_t)0x9f,
  (int8_t)0xa0, (int8_t)0xa1, (int8_t)0xa2, (int8_t)0xa3, (int8_t)0xa4, (int8_t)0xa5, (int8_t)0xa6, (int8_t)0xa7, (int8_t)0xa8, (int8_t)0xa9, (int8_t)0xaa, (int8_t)0xab,
  (int8_t)0xac, (int8_t)0xad, (int8_t)0xae, (int8_t)0xaf, (int8_t)0xb0, (int8_t)0xb1, (int8_t)0xb2, (int8_t)0xb3, (int8_t)0xb4, (int8_t)0xb5, (int8_t)0xb6, (int8_t)0xb7,
  (int8_t)0xb8, (int8_t)0xb9, (int8_t)0xba, (int8_t)0xbb, (int8_t)0xbc, (int8_t)0xbd, (int8_t)0xbe, (int8_t)0xbf, (int8_t)0xc0, (int8_t)0xc1, (int8_t)0xc2, (int8_t)0xc3,
  (int8_t)0xc4, (int8_t)0xc5, (int8_t)0xc6, (int8_t)0xc7, (int8_t)0xc8, (int8_t)0xc9, (int8_t)0xca, (int8_t)0xcb, (int8_t)0xcc, (int8_t)0xcd, (int8_t)0xce, (int8_t)0xcf,
  (int8_t)0xd0, (int8_t)0xd1, (int8_t)0xd2, (int8_t)0xd3, (int8_t)0xd4, (int8_t)0xd5, (int8_t)0xd6, (int8_t)0xd7, (int8_t)0xd8, (int8_t)0xd9, (int8_t)0xda, (int8_t)0xdb,
  (int8_t)0xdc, (int8_t)0xdd, (int8_t)0xde, (int8_t)0xdf, (int8_t)0xe0, (int8_t)0xe1, (int8_t)0xe2, (int8_t)0xe3, (int8_t)0xe4, (int8_t)0xe5, (int8_t)0xe6, (int8_t)0xe7,
  (int8_t)0xe8, (int8_t)0xe9, (int8_t)0xea, (int8_t)0xeb, (int8_t)0xec, (int8_t)0xed, (int8_t)0xee, (int8_t)0xef, (int8_t)0xf0, (int8_t)0xf1, (int8_t)0xf2, (int8_t)0xf3,
  (int8_t)0xf4, (int8_t)0xf5, (int8_t)0xf6, (int8_t)0xf7, (int8_t)0xf8, (int8_t)0xf9, (int8_t)0xfa, (int8_t)0xfb, (int8_t)0xfc, (int8_t)0xfd, (int8_t)0xfe, (int8_t)0xff,
  (int8_t)0x00, (int8_t)0x01, (int8_t)0x02, (int8_t)0x03, (int8_t)0x04, (int8_t)0x05, (int8_t)0x06, (int8_t)0x07, (int8_t)0x08, (int8_t)0x09, (int8_t)0x0a, (int8_t)0x0b,
  (int8_t)0x0c, (int8_t)0x0d, (int8_t)0x0e, (int8_t)0x0f, (int8_t)0x10, (int8_t)0x11, (int8_t)0x12, (int8_t)0x13, (int8_t)0x14, (int8_t)0x15, (int8_t)0x16, (int8_t)0x17,
  (int8_t)0x18, (int8_t)0x19, (int8_t)0x1a, (int8_t)0x1b, (int8_t)0x1c, (int8_t)0x1d, (int8_t)0x1e, (int8_t)0x1f, (int8_t)0x20, (int8_t)0x21, (int8_t)0x22, (int8_t)0x23,
  (int8_t)0x24, (int8_t)0x25, (int8_t)0x26, (int8_t)0x27, (int8_t)0x28, (int8_t)0x29, (int8_t)0x2a, (int8_t)0x2b, (int8_t)0x2c, (int8_t)0x2d, (int8_t)0x2e, (int8_t)0x2f,
  (int8_t)0x30, (int8_t)0x31, (int8_t)0x32, (int8_t)0x33, (int8_t)0x34, (int8_t)0x35, (int8_t)0x36, (int8_t)0x37, (int8_t)0x38, (int8_t)0x39, (int8_t)0x3a, (int8_t)0x3b,
  (int8_t)0x3c, (int8_t)0x3d, (int8_t)0x3e, (int8_t)0x3f, (int8_t)0x40, (int8_t)0x41, (int8_t)0x42, (int8_t)0x43, (int8_t)0x44, (int8_t)0x45, (int8_t)0x46, (int8_t)0x47,
  (int8_t)0x48, (int8_t)0x49, (int8_t)0x4a, (int8_t)0x4b, (int8_t)0x4c, (int8_t)0x4d, (int8_t)0x4e, (int8_t)0x4f, (int8_t)0x50, (int8_t)0x51, (int8_t)0x52, (int8_t)0x53,
  (int8_t)0x54, (int8_t)0x55, (int8_t)0x56, (int8_t)0x57, (int8_t)0x58, (int8_t)0x59, (int8_t)0x5a, (int8_t)0x5b, (int8_t)0x5c, (int8_t)0x5d, (int8_t)0x5e, (int8_t)0x5f,
  (int8_t)0x60, (int8_t)0x61, (int8_t)0x62, (int8_t)0x63, (int8_t)0x64, (int8_t)0x65, (int8_t)0x66, (int8_t)0x67, (int8_t)0x68, (int8_t)0x69, (int8_t)0x6a, (int8_t)0x6b,
  (int8_t)0x6c, (int8_t)0x6d, (int8_t)0x6e, (int8_t)0x6f, (int8_t)0x70, (int8_t)0x71, (int8_t)0x72, (int8_t)0x73, (int8_t)0x74, (int8_t)0x75, (int8_t)0x76, (int8_t)0x77,
  (int8_t)0x78, (int8_t)0x79, (int8_t)0x7a, (int8_t)0x7b, (int8_t)0x7c, (int8_t)0x7d, (int8_t)0x7e, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f
 };
 // Pre-computed table behind VP8ksclip2: slot [112 + i] holds i clamped to
 // [-16, 15] for every i in [-112, 112], i.e. 225 entries in total.
 // All 225 entries are spelled out explicitly: a missing trailing initializer
 // would be zero-filled by the compiler and make input +112 map to 0 instead
 // of 15.
 static const int8_t sclip2[112 + 112 + 1] = {
  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
  (int8_t)0xf0, (int8_t)0xf1, (int8_t)0xf2, (int8_t)0xf3, (int8_t)0xf4, (int8_t)0xf5, (int8_t)0xf6, (int8_t)0xf7, (int8_t)0xf8, (int8_t)0xf9, (int8_t)0xfa, (int8_t)0xfb,
  (int8_t)0xfc, (int8_t)0xfd, (int8_t)0xfe, (int8_t)0xff, (int8_t)0x00, (int8_t)0x01, (int8_t)0x02, (int8_t)0x03, (int8_t)0x04, (int8_t)0x05, (int8_t)0x06, (int8_t)0x07,
  (int8_t)0x08, (int8_t)0x09, (int8_t)0x0a, (int8_t)0x0b, (int8_t)0x0c, (int8_t)0x0d, (int8_t)0x0e, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f
 };
 // Pre-computed table behind VP8kclip1: slot [255 + i] holds i clamped to
 // [0, 255] for every i in [-255, 511], i.e. 767 entries in total.
 // All 767 entries are spelled out explicitly: a missing trailing initializer
 // would be zero-filled by the compiler and make input 511 map to 0 instead
 // of 255.
 static const uint8_t clip1[255 + 511 + 1] = {
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x01, (uint8_t)0x02, (uint8_t)0x03, (uint8_t)0x04, (uint8_t)0x05, (uint8_t)0x06, (uint8_t)0x07, (uint8_t)0x08,
  (uint8_t)0x09, (uint8_t)0x0a, (uint8_t)0x0b, (uint8_t)0x0c, (uint8_t)0x0d, (uint8_t)0x0e, (uint8_t)0x0f, (uint8_t)0x10, (uint8_t)0x11, (uint8_t)0x12, (uint8_t)0x13, (uint8_t)0x14,
  (uint8_t)0x15, (uint8_t)0x16, (uint8_t)0x17, (uint8_t)0x18, (uint8_t)0x19, (uint8_t)0x1a, (uint8_t)0x1b, (uint8_t)0x1c, (uint8_t)0x1d, (uint8_t)0x1e, (uint8_t)0x1f, (uint8_t)0x20,
  (uint8_t)0x21, (uint8_t)0x22, (uint8_t)0x23, (uint8_t)0x24, (uint8_t)0x25, (uint8_t)0x26, (uint8_t)0x27, (uint8_t)0x28, (uint8_t)0x29, (uint8_t)0x2a, (uint8_t)0x2b, (uint8_t)0x2c,
  (uint8_t)0x2d, (uint8_t)0x2e, (uint8_t)0x2f, (uint8_t)0x30, (uint8_t)0x31, (uint8_t)0x32, (uint8_t)0x33, (uint8_t)0x34, (uint8_t)0x35, (uint8_t)0x36, (uint8_t)0x37, (uint8_t)0x38,
  (uint8_t)0x39, (uint8_t)0x3a, (uint8_t)0x3b, (uint8_t)0x3c, (uint8_t)0x3d, (uint8_t)0x3e, (uint8_t)0x3f, (uint8_t)0x40, (uint8_t)0x41, (uint8_t)0x42, (uint8_t)0x43, (uint8_t)0x44,
  (uint8_t)0x45, (uint8_t)0x46, (uint8_t)0x47, (uint8_t)0x48, (uint8_t)0x49, (uint8_t)0x4a, (uint8_t)0x4b, (uint8_t)0x4c, (uint8_t)0x4d, (uint8_t)0x4e, (uint8_t)0x4f, (uint8_t)0x50,
  (uint8_t)0x51, (uint8_t)0x52, (uint8_t)0x53, (uint8_t)0x54, (uint8_t)0x55, (uint8_t)0x56, (uint8_t)0x57, (uint8_t)0x58, (uint8_t)0x59, (uint8_t)0x5a, (uint8_t)0x5b, (uint8_t)0x5c,
  (uint8_t)0x5d, (uint8_t)0x5e, (uint8_t)0x5f, (uint8_t)0x60, (uint8_t)0x61, (uint8_t)0x62, (uint8_t)0x63, (uint8_t)0x64, (uint8_t)0x65, (uint8_t)0x66, (uint8_t)0x67, (uint8_t)0x68,
  (uint8_t)0x69, (uint8_t)0x6a, (uint8_t)0x6b, (uint8_t)0x6c, (uint8_t)0x6d, (uint8_t)0x6e, (uint8_t)0x6f, (uint8_t)0x70, (uint8_t)0x71, (uint8_t)0x72, (uint8_t)0x73, (uint8_t)0x74,
  (uint8_t)0x75, (uint8_t)0x76, (uint8_t)0x77, (uint8_t)0x78, (uint8_t)0x79, (uint8_t)0x7a, (uint8_t)0x7b, (uint8_t)0x7c, (uint8_t)0x7d, (uint8_t)0x7e, (uint8_t)0x7f, (uint8_t)0x80,
  (uint8_t)0x81, (uint8_t)0x82, (uint8_t)0x83, (uint8_t)0x84, (uint8_t)0x85, (uint8_t)0x86, (uint8_t)0x87, (uint8_t)0x88, (uint8_t)0x89, (uint8_t)0x8a, (uint8_t)0x8b, (uint8_t)0x8c,
  (uint8_t)0x8d, (uint8_t)0x8e, (uint8_t)0x8f, (uint8_t)0x90, (uint8_t)0x91, (uint8_t)0x92, (uint8_t)0x93, (uint8_t)0x94, (uint8_t)0x95, (uint8_t)0x96, (uint8_t)0x97, (uint8_t)0x98,
  (uint8_t)0x99, (uint8_t)0x9a, (uint8_t)0x9b, (uint8_t)0x9c, (uint8_t)0x9d, (uint8_t)0x9e, (uint8_t)0x9f, (uint8_t)0xa0, (uint8_t)0xa1, (uint8_t)0xa2, (uint8_t)0xa3, (uint8_t)0xa4,
  (uint8_t)0xa5, (uint8_t)0xa6, (uint8_t)0xa7, (uint8_t)0xa8, (uint8_t)0xa9, (uint8_t)0xaa, (uint8_t)0xab, (uint8_t)0xac, (uint8_t)0xad, (uint8_t)0xae, (uint8_t)0xaf, (uint8_t)0xb0,
  (uint8_t)0xb1, (uint8_t)0xb2, (uint8_t)0xb3, (uint8_t)0xb4, (uint8_t)0xb5, (uint8_t)0xb6, (uint8_t)0xb7, (uint8_t)0xb8, (uint8_t)0xb9, (uint8_t)0xba, (uint8_t)0xbb, (uint8_t)0xbc,
  (uint8_t)0xbd, (uint8_t)0xbe, (uint8_t)0xbf, (uint8_t)0xc0, (uint8_t)0xc1, (uint8_t)0xc2, (uint8_t)0xc3, (uint8_t)0xc4, (uint8_t)0xc5, (uint8_t)0xc6, (uint8_t)0xc7, (uint8_t)0xc8,
  (uint8_t)0xc9, (uint8_t)0xca, (uint8_t)0xcb, (uint8_t)0xcc, (uint8_t)0xcd, (uint8_t)0xce, (uint8_t)0xcf, (uint8_t)0xd0, (uint8_t)0xd1, (uint8_t)0xd2, (uint8_t)0xd3, (uint8_t)0xd4,
  (uint8_t)0xd5, (uint8_t)0xd6, (uint8_t)0xd7, (uint8_t)0xd8, (uint8_t)0xd9, (uint8_t)0xda, (uint8_t)0xdb, (uint8_t)0xdc, (uint8_t)0xdd, (uint8_t)0xde, (uint8_t)0xdf, (uint8_t)0xe0,
  (uint8_t)0xe1, (uint8_t)0xe2, (uint8_t)0xe3, (uint8_t)0xe4, (uint8_t)0xe5, (uint8_t)0xe6, (uint8_t)0xe7, (uint8_t)0xe8, (uint8_t)0xe9, (uint8_t)0xea, (uint8_t)0xeb, (uint8_t)0xec,
  (uint8_t)0xed, (uint8_t)0xee, (uint8_t)0xef, (uint8_t)0xf0, (uint8_t)0xf1, (uint8_t)0xf2, (uint8_t)0xf3, (uint8_t)0xf4, (uint8_t)0xf5, (uint8_t)0xf6, (uint8_t)0xf7, (uint8_t)0xf8,
  (uint8_t)0xf9, (uint8_t)0xfa, (uint8_t)0xfb, (uint8_t)0xfc, (uint8_t)0xfd, (uint8_t)0xfe, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff
 };
 #else
 // Run-time variant of the tables: not given explicit initializers here, they
 // are filled in by VP8InitClipTables(). Having static storage duration they
 // hold zeros until that call. Sizes follow the pattern (neg + pos + 1), where
 // 'neg' is the offset of the slot for input 0.
 static uint8_t abs0[255 + 255 + 1];
 static int8_t sclip1[1020 + 1020 + 1];
 static int8_t sclip2[112 + 112 + 1];
 static uint8_t clip1[255 + 511 + 1];
 // We declare this variable 'volatile' to prevent instruction reordering
 // and make sure it's set to true _last_ (so as to be thread-safe)
 // NOTE(review): 'volatile' is not a synchronization primitive in C/C++ and
 // does not by itself order the table stores for other threads; confirm that
 // concurrent first calls are acceptable for the callers.
 static volatile int tables_ok = 0;
 #endif
 // Exported views of the tables above. Each pointer is advanced to the slot
 // that corresponds to input 0, so it can be indexed directly with a signed
 // value (negative indices reach the lower half of the table).
 const int8_t* const VP8ksclip1 = sclip1 + 1020;
 const int8_t* const VP8ksclip2 = sclip2 + 112;
 const uint8_t* const VP8kclip1 = clip1 + 255;
 const uint8_t* const VP8kabs0 = abs0 + 255;
 // Populates the clipping lookup tables when the build does not define
 // USE_STATIC_TABLES; with USE_STATIC_TABLES the body is compiled out and the
 // call does nothing. The work is done only while 'tables_ok' is still zero,
 // and the flag is raised after all four tables have been written.
 //   abs0[255 + i]    = |i|                    for i in [-255, 255]
 //   sclip1[1020 + i] = i clamped to [-128,127] for i in [-1020, 1020]
 //   sclip2[112 + i]  = i clamped to [-16, 15]  for i in [-112, 112]
 //   clip1[255 + i]   = i clamped to [0, 255]   for i in [-255, 511]
 WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
 #if !defined(USE_STATIC_TABLES)
  int i;
  if (!tables_ok) {
    for (i = -255; i <= 255; ++i) {
      abs0[255 + i] = (i < 0) ? -i : i;
    }
    for (i = -1020; i <= 1020; ++i) {
      sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
    }
    for (i = -112; i <= 112; ++i) {
      sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
    }
    // The table has 255 + 511 + 1 slots, so the upper bound is 511: stopping
    // at 510 would leave the last slot at its zero default instead of 255.
    for (i = -255; i <= 511; ++i) {
      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
    }
    tables_ok = 1;   // set last, once every table is complete
  }
 #endif    // USE_STATIC_TABLES
 }
--- a/src/loaders/webp/dsp/dsp.h
+++ b/src/loaders/webp/dsp/dsp.h
@ -0,0 +1,319 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //   Speed-critical functions.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_DSP_DSP_H_
 #define WEBP_DSP_DSP_H_
 #ifdef HAVE_CONFIG_H
 #include "../webp/config.h"
 #endif
 #include "../webp/types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // The prediction function type declared further down in this header
 // addresses its destination block with this stride.
 #define BPS 32   // this is the common stride for enc/dec
 //------------------------------------------------------------------------------
 // CPU detection
 // Compiler version probes. A version is packed as (major << 8) | minor so
 // that "at least maj.min" is a single integer comparison. When the compiler
 // is not the one being probed, the version is 0 and the PREREQ test is a
 // constant 0.
 #if !defined(__GNUC__)
 # define LOCAL_GCC_VERSION 0
 # define LOCAL_GCC_PREREQ(maj, min) 0
 #else
 # define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
 # define LOCAL_GCC_PREREQ(maj, min) \
    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
 #endif  // __GNUC__
 #if !defined(__clang__)
 # define LOCAL_CLANG_VERSION 0
 # define LOCAL_CLANG_PREREQ(maj, min) 0
 #else
 # define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
 # define LOCAL_CLANG_PREREQ(maj, min) \
    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
 #endif  // __clang__
 // Flag Visual C++ builds (_MSC_VER above 1310) that target x86 or x64.
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
 #endif
 // WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
 // files without intrinsics, allowing the corresponding Init() to be called.
 // Files containing intrinsics will need to be built targeting the instruction
 // set so should succeed on one of the earlier tests.
 // NOTE: the define inside the block below is commented out, so this header
 // never defines WEBP_USE_SSE2 even when the condition holds.
 #if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
 //#define WEBP_USE_SSE2
 #endif
 // This macro prevents thread_sanitizer from reporting known concurrent writes.
 // It defaults to nothing and is redefined to the no_sanitize_thread attribute
 // only when the compiler reports the thread_sanitizer feature.
 #define WEBP_TSAN_IGNORE_FUNCTION
 // The __has_feature() query sits in its own nested #if: a compiler that does
 // not provide __has_feature could not parse it in a combined condition.
 #if defined(__has_feature)
 #if __has_feature(thread_sanitizer)
 #undef WEBP_TSAN_IGNORE_FUNCTION
 #define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
 #endif
 #endif
 // CPU features that can be queried through VP8GetCPUInfo; SSE2 is the only
 // one listed in this header.
 typedef enum {
  kSSE2,
 } CPUFeature;
 // returns true if the CPU supports the feature.
 typedef int (*VP8CPUInfo)(CPUFeature feature);
 // Query hook of the type above, declared here and defined elsewhere.
 WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
 //------------------------------------------------------------------------------
 // Init stub generator
 // Defines an init function stub to ensure each module exposes a symbol,
 // avoiding a compiler warning.
 // 'func' is the name of the function to emit: the macro first declares it,
 // then defines it with an empty body.
 #define WEBP_DSP_INIT_STUB(func) \
  extern void func(void); \
  WEBP_TSAN_IGNORE_FUNCTION void func(void) {}
 //------------------------------------------------------------------------------
 // Decoding
 // Signature of the single-block transform entry points declared below.
 typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
 // when doing two transforms, coeffs is actually int16_t[2][16].
 typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
 // Transform entry points, exposed as function pointers.
 extern VP8DecIdct2 VP8Transform;
 extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformUV;
 extern VP8DecIdct VP8TransformDC;
 extern VP8DecIdct VP8TransformDCUV;
 // *dst is the destination block, with stride BPS. Boundary samples are
 // assumed accessible when needed.
 typedef void (*VP8PredFunc)(uint8_t* dst);
 // Prediction function tables; the array bounds are given only as comments.
 extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
 extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
 extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
 // clipping tables (for filtering)
 // Each pointer is indexed with the signed input value; the range each table
 // accepts is listed next to it.
 extern const int8_t* const VP8ksclip1;  // clips [-1020, 1020] to [-128, 127]
 extern const int8_t* const VP8ksclip2;  // clips [-112, 112] to [-16, 15]
 extern const uint8_t* const VP8kclip1;  // clips [-255,511] to [0,255]
 extern const uint8_t* const VP8kabs0;   // abs(x) for x in [-255,255]
 // must be called first
 void VP8InitClipTables(void);
 // simple filter (only for luma)
 typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
 extern VP8SimpleFilterFunc VP8SimpleVFilter16;
 extern VP8SimpleFilterFunc VP8SimpleHFilter16;
 extern VP8SimpleFilterFunc VP8SimpleVFilter16i;  // filter 3 inner edges
 extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
 // regular filter (on both macroblock edges and inner edges)
 typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
                                  int thresh, int ithresh, int hev_t);
 typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
                                    int thresh, int ithresh, int hev_t);
 // on outer edge
 extern VP8LumaFilterFunc VP8VFilter16;
 extern VP8LumaFilterFunc VP8HFilter16;
 extern VP8ChromaFilterFunc VP8VFilter8;
 extern VP8ChromaFilterFunc VP8HFilter8;
 // on inner edge
 extern VP8LumaFilterFunc VP8VFilter16i;   // filtering 3 inner edges altogether
 extern VP8LumaFilterFunc VP8HFilter16i;
 extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
 extern VP8ChromaFilterFunc VP8HFilter8i;
 // must be called before anything using the above
 void VP8DspInit(void);
 //------------------------------------------------------------------------------
 // WebP I/O
 //#define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
 // Convert a pair of y/u/v lines together to the output rgb/a colorspace.
 // bottom_y can be NULL if only one line of output is needed (at top/bottom).
 typedef void (*WebPUpsampleLinePairFunc)(
    const uint8_t* top_y, const uint8_t* bottom_y,
    const uint8_t* top_u, const uint8_t* top_v,
    const uint8_t* cur_u, const uint8_t* cur_v,
    uint8_t* top_dst, uint8_t* bottom_dst, int len);
 // The table below is only declared when FANCY_UPSAMPLING is defined; the
 // define at the top of this section is commented out.
 #ifdef FANCY_UPSAMPLING
 // Fancy upsampling functions to convert YUV to RGB(A) modes
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 #endif    // FANCY_UPSAMPLING
 // Per-row point-sampling methods.
 typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
                                   const uint8_t* u, const uint8_t* v,
                                   uint8_t* dst, int len);
 // Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
 void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
                             const uint8_t* u, const uint8_t* v, int uv_stride,
                             uint8_t* dst, int dst_stride,
                             int width, int height, WebPSamplerRowFunc func);
 // Sampling functions to convert rows of YUV to RGB(A)
 extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
 // General function for converting two lines of ARGB or RGBA.
 // 'alpha_is_last' should be true if 0xff000000 is stored in memory
 // as 0x00, 0x00, 0x00, 0xff (little endian).
 WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
 // YUV444->RGB converters
 typedef void (*WebPYUV444Converter)(const uint8_t* y,
                                    const uint8_t* u, const uint8_t* v,
                                    uint8_t* dst, int len);
 extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
 // Must be called before using the WebPUpsamplers[] (and for premultiplied
 // colorspaces like rgbA, rgbA4444, etc)
 void WebPInitUpsamplers(void);
 // Must be called before using WebPSamplers[]
 void WebPInitSamplers(void);
 // Must be called before using WebPYUV444Converters[]
 void WebPInitYUV444Converters(void);
 //------------------------------------------------------------------------------
 // Rescaler
 // Forward declaration only: this header uses the rescaler through pointers
 // and does not define the struct.
 struct WebPRescaler;
 // Import a row of data and save its contribution in the rescaler.
 // 'channel' denotes the channel number to be imported.
 extern void (*WebPRescalerImportRow)(struct WebPRescaler* const wrk,
                                     const uint8_t* const src, int channel);
 // Export one row (starting at x_out position) from rescaler.
 extern void (*WebPRescalerExportRow)(struct WebPRescaler* const wrk, int x_out);
 // Plain-C implementation, as fall-back.
 extern void WebPRescalerExportRowC(struct WebPRescaler* const wrk, int x_out);
 // Must be called first before using the above.
 void WebPRescalerDspInit(void);
 //------------------------------------------------------------------------------
 // Utilities for processing transparent channel.
 // Apply alpha pre-multiply on an rgba, bgra or argb plane of size w * h.
 // alpha_first should be 0 for argb, 1 for rgba or bgra (where alpha is last).
 // NOTE(review): the parameter name suggests the opposite mapping (non-zero
 // when alpha is the first byte, i.e. argb) -- confirm against the
 // implementation before relying on the sentence above.
 extern void (*WebPApplyAlphaMultiply)(
    uint8_t* rgba, int alpha_first, int w, int h, int stride);
 // Same, but specifically for RGBA4444 format
 extern void (*WebPApplyAlphaMultiply4444)(
    uint8_t* rgba4444, int w, int h, int stride);
 // Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
 // Returns true if alpha[] plane has non-trivial values different from 0xff.
 extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
                                int width, int height,
                                uint8_t* dst, int dst_stride);
 // Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
 // A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
 extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
                                        int width, int height,
                                        uint32_t* dst, int dst_stride);
 // Extract the alpha values from 32b values in argb[] and pack them into alpha[]
 // (this is the opposite of WebPDispatchAlpha).
 // Returns true if there's only trivial 0xff alpha values.
 extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
                               int width, int height,
                               uint8_t* alpha, int alpha_stride);
 // Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
 // Un-Multiply operation transforms x into x * 255 / A.
 // Pre-Multiply or Un-Multiply (if 'inverse' is true) argb values in a row.
 extern void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
 // Same as WebPMultARGBRow(), but for several rows.
 void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
                      int inverse);
 // Same for a row of single values, with side alpha values.
 extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
                           int width, int inverse);
 // Same as WebPMultRow(), but for several 'num_rows' rows.
 void WebPMultRows(uint8_t* ptr, int stride,
                  const uint8_t* alpha, int alpha_stride,
                  int width, int num_rows, int inverse);
 // Plain-C versions, used as fallback by some implementations.
 void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
                  int width, int inverse);
 void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
 // To be called first before using the above.
 void WebPInitAlphaProcessing(void);
 // ARGB packing function: a/r/g/b input is in rgba or bgra order.
 extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
                           const uint8_t* g, const uint8_t* b, int len,
                           uint32_t* out);
 // RGB packing function. 'step' can be 3 or 4. r/g/b input is in rgb or bgr
 // order.
 extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
                          int len, int step, uint32_t* out);
 //------------------------------------------------------------------------------
 // Filter functions
 // The first four values are real filters and index the WebPFilters[] and
 // WebPUnfilters[] tables below (both sized WEBP_FILTER_LAST); the values
 // after the end marker are meta-types and have no slot in those tables.
 typedef enum {     // Filter types.
  WEBP_FILTER_NONE = 0,
  WEBP_FILTER_HORIZONTAL,
  WEBP_FILTER_VERTICAL,
  WEBP_FILTER_GRADIENT,
  WEBP_FILTER_LAST = WEBP_FILTER_GRADIENT + 1,  // end marker
  WEBP_FILTER_BEST,    // meta-types
  WEBP_FILTER_FAST
 } WEBP_FILTER_TYPE;
 // Signature of a forward filter: reads 'in', writes 'out'.
 typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
                               int stride, uint8_t* out);
 // Signature of an inverse filter: works in place on 'data'.
 typedef void (*WebPUnfilterFunc)(int width, int height, int stride,
                                 int row, int num_rows, uint8_t* data);
 // Filter the given data using the given predictor.
 // 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
 // in raster order.
 // 'stride' is number of bytes per scan line (with possible padding).
 // 'out' should be pre-allocated.
 extern WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
 // In-place reconstruct the original data from the given filtered data.
 // The reconstruction will be done for 'num_rows' rows starting from 'row'
 // (assuming rows up to 'row - 1' are already reconstructed).
 extern WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
 // To be called first before using the above.
 void VP8FiltersInit(void);
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_DSP_DSP_H_ */
--- a/src/loaders/webp/dsp/filters.cpp
+++ b/src/loaders/webp/dsp/filters.cpp
@ -0,0 +1,240 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Spatial prediction using various filters
 //
 // Author: Urvang (urvang@google.com)
 #include "./dsp.h"
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 //------------------------------------------------------------------------------
 // Helpful macro.
 // Debug-build validation of the arguments shared by the Do*Filter() helpers.
 // Besides its two arguments it reads the caller's locals 'width', 'height',
 // 'stride', 'row' and 'num_rows'.
 // The body is wrapped in do { } while (0) so that the macro expands to exactly
 // one statement (safe under an unbraced if/else), and the macro arguments are
 // parenthesized. The final (void)height silences the unused warning when the
 // asserts are compiled out.
 # define SANITY_CHECK(in, out)                                                 \
  do {                                                                         \
    assert((in) != NULL);                                                      \
    assert((out) != NULL);                                                     \
    assert(width > 0);                                                         \
    assert(height > 0);                                                        \
    assert(stride >= width);                                                   \
    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
    (void)height;                                                              \
  } while (0)
 // Combines 'length' samples of 'src' with the matching samples of 'pred' and
 // stores the 8-bit result in 'dst': src + pred when 'inverse' is non-zero,
 // src - pred otherwise. Samples are processed front to back, one at a time,
 // which matters when 'pred' aliases the part of 'dst' already written.
 static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
                                     uint8_t* dst, int length, int inverse) {
  int remaining;
  if (inverse) {
    for (remaining = length; remaining > 0; --remaining, ++src, ++pred, ++dst) {
      *dst = (uint8_t)(*src + *pred);
    }
  } else {
    for (remaining = length; remaining > 0; --remaining, ++src, ++pred, ++dst) {
      *dst = (uint8_t)(*src - *pred);
    }
  }
 }
 //------------------------------------------------------------------------------
 // Horizontal filter.
 // Applies (inverse == 0) or undoes (inverse != 0) the horizontal filter on
 // rows [row, row + num_rows) of a plane whose rows are 'stride' bytes apart.
 // Every pixel is predicted from its left neighbour; the leftmost pixel of a
 // row is predicted from the pixel above it, and the very first pixel of the
 // plane is copied unchanged. Predictions come from 'in' when filtering and
 // from the already written 'out' when unfiltering.
 static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
                                            int width, int height, int stride,
                                            int row, int num_rows,
                                            int inverse, uint8_t* out) {
  const int row_end = row + num_rows;
  const size_t first_offset = row * stride;
  const uint8_t* ref;
  SANITY_CHECK(in, out);
  in += first_offset;
  out += first_offset;
  ref = inverse ? out : in;
  if (row == 0) {
    // Topmost scanline: nothing above, so the leftmost pixel is kept as is.
    out[0] = in[0];
    PredictLine(in + 1, ref, out + 1, width - 1, inverse);
    in += stride;
    out += stride;
    ref += stride;
    row = 1;
  }
  // Remaining scanlines.
  for (; row < row_end; ++row) {
    // Leftmost pixel uses the pixel directly above as predictor.
    PredictLine(in, ref - stride, out, 1, inverse);
    PredictLine(in + 1, ref, out + 1, width - 1, inverse);
    in += stride;
    out += stride;
    ref += stride;
  }
 }
 //------------------------------------------------------------------------------
 // Vertical filter.
 // Applies (inverse == 0) or undoes (inverse != 0) the vertical filter on rows
 // [row, row + num_rows) of a plane whose rows are 'stride' bytes apart.
 // Row 0 is left-predicted (its first pixel is copied); every other row is
 // predicted pixel-by-pixel from the row above. Predictions come from 'in'
 // when filtering and from the already written 'out' when unfiltering.
 static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
                                          int width, int height, int stride,
                                          int row, int num_rows,
                                          int inverse, uint8_t* out) {
  const int row_end = row + num_rows;
  const size_t first_offset = row * stride;
  const uint8_t* above;
  SANITY_CHECK(in, out);
  in += first_offset;
  out += first_offset;
  above = inverse ? out : in;
  if (row != 0) {
    // Starting in the middle of the plane: step back to the previous row.
    above -= stride;
  } else {
    // Top-left pixel is copied, the rest of the top row is left-predicted.
    // 'above' is deliberately not advanced here: it already designates row 0,
    // which is the row above the next one to process.
    out[0] = in[0];
    PredictLine(in + 1, above, out + 1, width - 1, inverse);
    in += stride;
    out += stride;
    row = 1;
  }
  // Remaining scanlines.
  for (; row < row_end; ++row) {
    PredictLine(in, above, out, width, inverse);
    in += stride;
    out += stride;
    above += stride;
  }
 }
 //------------------------------------------------------------------------------
 // Gradient filter.
 // Returns a + b - c clamped to the range [0, 255].
 static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
  const int gradient = (int)a + (int)b - (int)c;
  if (gradient < 0) return 0;
  if (gradient > 255) return 255;
  return gradient;
 }
 // Applies (inverse == 0) or undoes (inverse != 0) the gradient filter on rows
 // [row, row + num_rows) of a plane whose rows are 'stride' bytes apart.
 // Row 0 is left-predicted (its first pixel is copied). On other rows the
 // leftmost pixel is predicted from the pixel above and every other pixel from
 // GradientPredictor(left, above, above-left). Predictions come from 'in' when
 // filtering and from the already written 'out' when unfiltering.
 static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
                                          int width, int height, int stride,
                                          int row, int num_rows,
                                          int inverse, uint8_t* out) {
  const int row_end = row + num_rows;
  const size_t first_offset = row * stride;
  const uint8_t* ref;
  SANITY_CHECK(in, out);
  in += first_offset;
  out += first_offset;
  ref = inverse ? out : in;
  if (row == 0) {
    // Top scanline: plain left prediction.
    out[0] = in[0];
    PredictLine(in + 1, ref, out + 1, width - 1, inverse);
    in += stride;
    out += stride;
    ref += stride;
    row = 1;
  }
  // Remaining scanlines.
  for (; row < row_end; ++row) {
    int x;
    // Leftmost pixel: predicted from the pixel above.
    PredictLine(in, ref - stride, out, 1, inverse);
    // Left to right, so that when unfiltering the left neighbour read from
    // 'ref' is the value that has just been reconstructed.
    for (x = 1; x < width; ++x) {
      const uint8_t left = ref[x - 1];
      const uint8_t above = ref[x - stride];
      const uint8_t above_left = ref[x - stride - 1];
      const int pred = GradientPredictor(left, above, above_left);
      out[x] = (uint8_t)(inverse ? in[x] + pred : in[x] - pred);
    }
    in += stride;
    out += stride;
    ref += stride;
  }
 }
 #undef SANITY_CHECK
 //------------------------------------------------------------------------------
 // Forward horizontal filter over the whole plane: runs DoHorizontalFilter()
 // on rows [0, height) with inverse = 0, reading 'data' and writing
 // 'filtered_data'.
 static void HorizontalFilter(const uint8_t* data, int width, int height,
                             int stride, uint8_t* filtered_data) {
  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
 }
 // Forward vertical filter over the whole plane: runs DoVerticalFilter() on
 // rows [0, height) with inverse = 0, reading 'data' and writing
 // 'filtered_data'.
 static void VerticalFilter(const uint8_t* data, int width, int height,
                           int stride, uint8_t* filtered_data) {
  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
 }
 // Forward gradient filter over the whole plane: runs DoGradientFilter() on
 // rows [0, height) with inverse = 0, reading 'data' and writing
 // 'filtered_data'.
 static void GradientFilter(const uint8_t* data, int width, int height,
                           int stride, uint8_t* filtered_data) {
  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
 }
 //------------------------------------------------------------------------------
 // In-place inverse vertical filter: runs DoVerticalFilter() with inverse = 1
 // on rows [row, row + num_rows), using 'data' as both input and output.
 static void VerticalUnfilter(int width, int height, int stride, int row,
                             int num_rows, uint8_t* data) {
  DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
 }
 // In-place inverse horizontal filter: runs DoHorizontalFilter() with
 // inverse = 1 on rows [row, row + num_rows), using 'data' as both input and
 // output.
 static void HorizontalUnfilter(int width, int height, int stride, int row,
                               int num_rows, uint8_t* data) {
  DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
 }
 // In-place inverse gradient filter: runs DoGradientFilter() with inverse = 1
 // on rows [row, row + num_rows), using 'data' as both input and output.
 static void GradientUnfilter(int width, int height, int stride, int row,
                             int num_rows, uint8_t* data) {
  DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
 }
 //------------------------------------------------------------------------------
 // Init function
 // Dispatch tables, one slot per WEBP_FILTER_TYPE below WEBP_FILTER_LAST.
 // They are filled in by VP8FiltersInit().
 WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
 WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
 // Platform-specific initializers, only called when the matching WEBP_USE_*
 // macro is defined (see below).
 extern void VP8FiltersInitMIPSdspR2(void);
 extern void VP8FiltersInitSSE2(void);
 // Remembers the VP8GetCPUInfo value seen by the last completed
 // VP8FiltersInit() call. It starts out as its own address.
 // NOTE(review): presumably chosen so that it cannot compare equal to any real
 // VP8GetCPUInfo value before the first call -- confirm.
 static volatile VP8CPUInfo filters_last_cpuinfo_used =
    (VP8CPUInfo)&filters_last_cpuinfo_used;
 // Fills WebPFilters[] / WebPUnfilters[] with the C implementations from this
 // file, then lets the SSE2 / MIPS dspR2 initializers run when they are
 // compiled in and VP8GetCPUInfo reports the feature. Returns immediately if
 // VP8GetCPUInfo is unchanged since the previous call.
 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
  if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
  // WEBP_FILTER_NONE has no function: callers must handle the NULL entry.
  WebPUnfilters[WEBP_FILTER_NONE] = NULL;
  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
  WebPFilters[WEBP_FILTER_NONE] = NULL;
  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
  // Optional platform-specific initialization, run after the C defaults are
  // in place.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8FiltersInitSSE2();
    }
 #endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8FiltersInitMIPSdspR2();
    }
 #endif
  }
  // Record which VP8GetCPUInfo the tables were built for.
  filters_last_cpuinfo_used = VP8GetCPUInfo;
 }
--- a/src/loaders/webp/dsp/lossless.cpp
+++ b/src/loaders/webp/dsp/lossless.cpp
@ -0,0 +1,636 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Image transforms and color space conversion methods for lossless decoder.
 //
 // Authors: Vikas Arora (vikaas.arora@gmail.com)
 //          Jyrki Alakuijala (jyrki@google.com)
 //          Urvang Joshi (urvang@google.com)
 #include "./dsp.h"
 #include <math.h>
 #include <stdlib.h>
 #include "../dec/vp8li.h"
 #include "../utils/endian_inl.h"
 #include "./lossless.h"
 #include "./yuv.h"
 #define MAX_DIFF_COST (1e30f)
 //------------------------------------------------------------------------------
 // Image transforms.
 // Adds pixel 'b' to '*a' channel by channel and stores the result in '*a'.
 // Each 8-bit channel wraps around independently (mod 256).
 static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
  const uint32_t kAlphaGreenMask = 0xff00ff00u;
  const uint32_t kRedBlueMask = 0x00ff00ffu;
  const uint32_t current = *a;
  // Two non-adjacent channels are summed at once; the carry of each channel
  // lands in bits that are masked off afterwards.
  const uint32_t ag_sum = (current & kAlphaGreenMask) + (b & kAlphaGreenMask);
  const uint32_t rb_sum = (current & kRedBlueMask) + (b & kRedBlueMask);
  *a = (ag_sum & kAlphaGreenMask) | (rb_sum & kRedBlueMask);
 }
 // Per-byte average (rounded down) of the four 8-bit channels of 'a0' and
 // 'a1', computed without unpacking: the bits both inputs share plus half of
 // the bits in which they differ.
 static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
  const uint32_t common_bits = a0 & a1;
  // Bit 0 of every byte is cleared before shifting so that it cannot leak into
  // the neighbouring channel.
  const uint32_t half_of_diff = ((a0 ^ a1) & 0xfefefefeu) >> 1;
  return half_of_diff + common_bits;
 }
 // Averages 'a0' with 'a2' first, then averages that result with 'a1'
 // (per channel, see Average2()).
 static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  const uint32_t outer = Average2(a0, a2);
  return Average2(outer, a1);
 }
 // Averages the pair (a0, a1) and the pair (a2, a3), then averages the two
 // results (per channel, see Average2()).
 static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
                                      uint32_t a2, uint32_t a3) {
  const uint32_t first_pair = Average2(a0, a1);
  const uint32_t second_pair = Average2(a2, a3);
  return Average2(first_pair, second_pair);
 }
 // Returns 'a' unchanged when it is below 256. Otherwise returns the top byte
 // of ~a: 0 when 'a' holds a negative integer (top bits set), 255 when it
 // holds a positive one.
 static WEBP_INLINE uint32_t Clip255(uint32_t a) {
  return (a < 256) ? a : (~a >> 24);
 }
 // Returns a + b - c passed through Clip255().
 static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
  const int unclipped = a + b - c;
  return (int)Clip255((uint32_t)unclipped);
 }
 // Applies AddSubtractComponentFull() (c0 + c1 - c2, clipped) to each of the
 // four 8-bit channels of the packed pixels and repacks the result.
 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                                    uint32_t c2) {
  uint32_t packed = 0;
  int shift;
  // Channels sit at bit offsets 24, 16, 8 and 0.
  for (shift = 24; shift >= 0; shift -= 8) {
    const int channel = AddSubtractComponentFull((c0 >> shift) & 0xff,
                                                 (c1 >> shift) & 0xff,
                                                 (c2 >> shift) & 0xff);
    packed |= (uint32_t)channel << shift;
  }
  return packed;
 }
 // Returns a + (a - b) / 2 passed through Clip255().
 static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
  const int half_delta = (a - b) / 2;
  const int unclipped = a + half_delta;
  return (int)Clip255((uint32_t)unclipped);
 }
 // Averages 'c0' and 'c1' with Average2(), then applies
 // AddSubtractComponentHalf(average, c2) to each of the four 8-bit channels
 // and repacks the result.
 static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
                                                    uint32_t c2) {
  const uint32_t mean = Average2(c0, c1);
  uint32_t packed = 0;
  int shift;
  // Channels sit at bit offsets 24, 16, 8 and 0.
  for (shift = 24; shift >= 0; shift -= 8) {
    const int channel = AddSubtractComponentHalf((mean >> shift) & 0xff,
                                                 (c2 >> shift) & 0xff);
    packed |= (uint32_t)channel << shift;
  }
  return packed;
 }
 // gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
 #if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
 # define LOCAL_INLINE __attribute__ ((noinline))
 #else
 # define LOCAL_INLINE WEBP_INLINE
 #endif
 // Returns |b - c| - |a - c|.
 static LOCAL_INLINE int Sub3(int a, int b, int c) {
  return abs(b - c) - abs(a - c);
 }
 #undef LOCAL_INLINE
 // Sums Sub3() over the four 8-bit channels of the packed pixels and returns
 // 'a' when the total is <= 0, 'b' otherwise.
 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb = 0;
  int shift;
  // Channels sit at bit offsets 24, 16, 8 and 0.
  for (shift = 24; shift >= 0; shift -= 8) {
    pa_minus_pb += Sub3((a >> shift) & 0xff,
                        (b >> shift) & 0xff,
                        (c >> shift) & 0xff);
  }
  if (pa_minus_pb <= 0) return a;
  return b;
 }
 //------------------------------------------------------------------------------
 // Predictors
 // Predictor 0: ignores both neighbours and always predicts ARGB_BLACK.
 static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ARGB_BLACK;
  (void)left;
  (void)top;
  return pred;
 }
 // Predictor 1: predicts the 'left' pixel; 'top' is not used.
 static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = left;
  (void)top;
  return pred;
 }
 // Predictor 2: predicts the pixel at top[0]; 'left' is not used.
 static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = *top;
  (void)left;
  return pred;
 }
 // Predictor 3: predicts the pixel at top[1]; 'left' is not used.
 static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = *(top + 1);
  (void)left;
  return pred;
 }
 // Predictor 4: predicts the pixel at top[-1]; 'left' is not used.
 static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = *(top - 1);
  (void)left;
  return pred;
 }
 // Predictor 5: Average3() of 'left', top[0] and top[1].
 static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
  return Average3(left, top[0], top[1]);
 }
 // Predictor 6: Average2() of 'left' and top[-1].
 static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
  return Average2(left, top[-1]);
 }
 // Predictor 7: Average2() of 'left' and top[0].
 static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
  return Average2(left, top[0]);
 }
 // Predictor 8: Average2() of top[-1] and top[0]; 'left' is not used.
 static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
  (void)left;
  return Average2(top[-1], top[0]);
 }
 // Predictor 9: Average2() of top[0] and top[1]; 'left' is not used.
 static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
  (void)left;
  return Average2(top[0], top[1]);
 }
 // Predictor 10: Average4() of 'left', top[-1], top[0] and top[1].
 static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
  return Average4(left, top[-1], top[0], top[1]);
 }
 // Predictor 11: Select() between top[0] and 'left', using top[-1] as the
 // reference pixel.
 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
  return Select(top[0], left, top[-1]);
 }
 // Predictor 12: ClampedAddSubtractFull() of 'left', top[0] and top[-1].
 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractFull(left, top[0], top[-1]);
 }
 // Predictor 13: ClampedAddSubtractHalf() of 'left', top[0] and top[-1].
 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
  return ClampedAddSubtractHalf(left, top[0], top[-1]);
 }
 //------------------------------------------------------------------------------
 // Inverse prediction.
 // Undoes the predictor transform in place on rows [y_start, y_end) of 'data',
 // a row-major buffer of transform->xsize_ pixels per row whose first element
 // is the first pixel of row 'y_start'. Each pixel has its prediction added to
 // it with AddPixelsEq().
 // The predictor of a pixel is selected by bits 8..11 of the entry of
 // transform->data_ covering the pixel's tile (tiles are squares with a side of
 // 1 << transform->bits_ pixels), with two exceptions: row 0 is always
 // left-predicted, and the first pixel of any other row is always predicted
 // from the pixel above.
 // When y_start > 0 the row preceding 'data' (data - width) must already hold
 // reconstructed pixels, since it is read as the 'top' row. Note that data[-1]
 // is read even when y_start == 0.
 static void PredictorInverseTransform(const VP8LTransform* const transform,
                                      int y_start, int y_end, uint32_t* data) {
  const int width = transform->xsize_;
  if (y_start == 0) {  // First Row follows the L (mode=1) mode.
    int x;
    // The very first pixel has no neighbour: Predictor0 ignores its arguments.
    const uint32_t pred0 = Predictor0(data[-1], NULL);
    AddPixelsEq(data, pred0);
    for (x = 1; x < width; ++x) {
      const uint32_t pred1 = Predictor1(data[x - 1], NULL);
      AddPixelsEq(data + x, pred1);
    }
    data += width;
    ++y_start;
  }
  {
    int y = y_start;
    const int tile_width = 1 << transform->bits_;
    const int mask = tile_width - 1;
    // Width rounded down to a whole number of tiles.
    const int safe_width = width & ~mask;
    const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
    // Predictor modes of the tile row that contains row 'y'.
    const uint32_t* pred_mode_base =
        transform->data_ + (y >> transform->bits_) * tiles_per_row;
    while (y < y_end) {
      const uint32_t pred2 = Predictor2(data[-1], data - width);
      const uint32_t* pred_mode_src = pred_mode_base;
      VP8LPredictorFunc pred_func;
      int x = 1;
      // 't' counts pixels inside the current tile. It starts at 1 for the
      // first tile because pixel 0 is handled separately just below.
      int t = 1;
      // First pixel follows the T (mode=2) mode.
      AddPixelsEq(data, pred2);
      // .. the rest:
      while (x < safe_width) {
        pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
        for (; t < tile_width; ++t, ++x) {
          const uint32_t pred = pred_func(data[x - 1], data + x - width);
          AddPixelsEq(data + x, pred);
        }
        t = 0;
      }
      // Trailing partial tile, if the width is not a multiple of tile_width.
      if (x < width) {
        pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
        for (; x < width; ++x) {
          const uint32_t pred = pred_func(data[x - 1], data + x - width);
          AddPixelsEq(data + x, pred);
        }
      }
      data += width;
      ++y;
      if ((y & mask) == 0) {   // Use the same mask, since tiles are squares.
        pred_mode_base += tiles_per_row;
      }
    }
  }
 }
 // Add green to blue and red channels (i.e. perform the inverse transform of
 // 'subtract green').
 void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t argb = data[i];
    const uint32_t green = ((argb >> 8) & 0xff);
    uint32_t red_blue = (argb & 0x00ff00ffu);
    red_blue += (green << 16) | green;
    red_blue &= 0x00ff00ffu;
    data[i] = (argb & 0xff00ff00u) | red_blue;
  }
 }
 // Multiplies the two signed 8-bit values and returns the product, converted
 // to uint32_t, shifted right by 5 bits.
 static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
                                                 int8_t color) {
  const int product = (int)color_pred * (int)color;
  return (uint32_t)product >> 5;
 }
 // Unpacks the three 8-bit multipliers stored in 'color_code' into '*m':
 // bits 0..7 -> green_to_red_, bits 8..15 -> green_to_blue_,
 // bits 16..23 -> red_to_blue_.
 static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
                                                VP8LMultipliers* const m) {
  const uint32_t low_byte = color_code & 0xff;
  const uint32_t mid_byte = (color_code >> 8) & 0xff;
  const uint32_t high_byte = (color_code >> 16) & 0xff;
  m->green_to_red_ = low_byte;
  m->green_to_blue_ = mid_byte;
  m->red_to_blue_ = high_byte;
 }
 // Undoes the cross-color transform on 'num_pixels' pixels of 'data' using the
 // multipliers in '*m'. For every pixel:
 //   red  += delta(green_to_red_, green)
 //   blue += delta(green_to_blue_, green) + delta(red_to_blue_, new red)
 // where delta() is ColorTransformDelta() and both results are kept mod 256.
 // Alpha and green are left untouched.
 void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
                                  int num_pixels) {
  int idx;
  for (idx = 0; idx < num_pixels; ++idx) {
    const uint32_t pixel = data[idx];
    const uint32_t green = pixel >> 8;
    // Red is rebuilt first because the blue correction depends on the
    // reconstructed red value.
    const uint32_t red =
        ((pixel >> 16) + ColorTransformDelta(m->green_to_red_, green)) & 0xff;
    uint32_t blue = pixel;
    blue += ColorTransformDelta(m->green_to_blue_, green);
    blue += ColorTransformDelta(m->red_to_blue_, red);
    blue &= 0xff;
    data[idx] = (pixel & 0xff00ff00u) | (red << 16) | blue;
  }
 }
 // Color space inverse transform.
 static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
                                       int y_start, int y_end, uint32_t* data) {
  const int width = transform->xsize_;
  const int tile_width = 1 << transform->bits_;
  const int mask = tile_width - 1;
  const int safe_width = width & ~mask;
  const int remaining_width = width - safe_width;
  const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
  int y = y_start;
  const uint32_t* pred_row =
      transform->data_ + (y >> transform->bits_) * tiles_per_row;
  while (y < y_end) {
    const uint32_t* pred = pred_row;
    VP8LMultipliers m = { 0, 0, 0 };
    const uint32_t* const data_safe_end = data + safe_width;
    const uint32_t* const data_end = data + width;
    while (data < data_safe_end) {
      ColorCodeToMultipliers(*pred++, &m);
      VP8LTransformColorInverse(&m, data, tile_width);
      data += tile_width;
    }
    if (data < data_end) {  // Left-overs using C-version.
      ColorCodeToMultipliers(*pred++, &m);
      VP8LTransformColorInverse(&m, data, remaining_width);
      data += remaining_width;
    }
    ++y;
    if ((y & mask) == 0) pred_row += tiles_per_row;
  }
 }
 // Separate out pixels packed together using pixel-bundling.
 // We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
 //
 // COLOR_INDEX_INVERSE generates two functions working on elements of 'TYPE':
 //  - F_NAME: a static mapper that, for each of the (y_end - y_start) * width
 //    elements read from 'src', writes GET_VALUE(color_map[GET_INDEX(elem)])
 //    to 'dst'.
 //  - FUNC_NAME (declared with STATIC_DECL): the inverse color-indexing
 //    transform. When transform->bits_ is non-zero (fewer than 8 bits per
 //    pixel), each source element holds (1 << bits_) packed indices which are
 //    extracted lowest bits first, with a fresh source element loaded at the
 //    start of every row. Otherwise it calls VP8LMapColor##BIT_SUFFIX.
 // No comment can be placed inside the macro body other than the /* */ ones
 // already present, because of the line continuations.
 #define COLOR_INDEX_INVERSE(FUNC_NAME, F_NAME, STATIC_DECL, TYPE, BIT_SUFFIX,  \
                            GET_INDEX, GET_VALUE)                              \
 static void F_NAME(const TYPE* src, const uint32_t* const color_map,           \
                   TYPE* dst, int y_start, int y_end, int width) {             \
  int y;                                                                       \
  for (y = y_start; y < y_end; ++y) {                                          \
    int x;                                                                     \
    for (x = 0; x < width; ++x) {                                              \
      *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]);                        \
    }                                                                          \
  }                                                                            \
 }                                                                              \
 STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform,               \
                           int y_start, int y_end, const TYPE* src,            \
                           TYPE* dst) {                                        \
  int y;                                                                       \
  const int bits_per_pixel = 8 >> transform->bits_;                            \
  const int width = transform->xsize_;                                         \
  const uint32_t* const color_map = transform->data_;                          \
  if (bits_per_pixel < 8) {                                                    \
    const int pixels_per_byte = 1 << transform->bits_;                         \
    const int count_mask = pixels_per_byte - 1;                                \
    const uint32_t bit_mask = (1 << bits_per_pixel) - 1;                       \
    for (y = y_start; y < y_end; ++y) {                                        \
      uint32_t packed_pixels = 0;                                              \
      int x;                                                                   \
      for (x = 0; x < width; ++x) {                                            \
        /* We need to load fresh 'packed_pixels' once every                */  \
        /* 'pixels_per_byte' increments of x. Fortunately, pixels_per_byte */  \
        /* is a power of 2, so can just use a mask for that, instead of    */  \
        /* decrementing a counter.                                         */  \
        if ((x & count_mask) == 0) packed_pixels = GET_INDEX(*src++);          \
        *dst++ = GET_VALUE(color_map[packed_pixels & bit_mask]);               \
        packed_pixels >>= bits_per_pixel;                                      \
      }                                                                        \
    }                                                                          \
  } else {                                                                     \
    VP8LMapColor##BIT_SUFFIX(src, color_map, dst, y_start, y_end, width);      \
  }                                                                            \
 }
 // ARGB flavour: static ColorIndexInverseTransform() plus the MapARGB() mapper.
 COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
                    VP8GetARGBIndex, VP8GetARGBValue)
 // Alpha-only flavour: externally visible
 // VP8LColorIndexInverseTransformAlpha() plus the MapAlpha() mapper.
 COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
                    8b, VP8GetAlphaIndex, VP8GetAlphaValue)
 #undef COLOR_INDEX_INVERSE
 // Applies the inverse of 'transform' to rows [row_start, row_end).
 // 'out' receives the result. 'in' is only read by the color-indexing
 // transform; the other transforms work in place on 'out'.
 // Requires row_start < row_end <= transform->ysize_ (asserted).
 // A transform->type_ that matches none of the cases is silently ignored.
 void VP8LInverseTransform(const VP8LTransform* const transform,
                          int row_start, int row_end,
                          const uint32_t* const in, uint32_t* const out) {
  const int width = transform->xsize_;
  assert(row_start < row_end);
  assert(row_end <= transform->ysize_);
  switch (transform->type_) {
    case SUBTRACT_GREEN:
      // Purely per-pixel: treat the rows as one flat run of pixels.
      VP8LAddGreenToBlueAndRed(out, (row_end - row_start) * width);
      break;
    case PREDICTOR_TRANSFORM:
      PredictorInverseTransform(transform, row_start, row_end, out);
      if (row_end != transform->ysize_) {
        // The last predicted row in this iteration will be the top-pred row
        // for the first row in next iteration.
        // Note: this writes to the 'width' pixels located just before 'out'.
        memcpy(out - width, out + (row_end - row_start - 1) * width,
               width * sizeof(*out));
      }
      break;
    case CROSS_COLOR_TRANSFORM:
      ColorSpaceInverseTransform(transform, row_start, row_end, out);
      break;
    case COLOR_INDEXING_TRANSFORM:
      if (in == out && transform->bits_ > 0) {
        // Move packed pixels to the end of unpacked region, so that unpacking
        // can occur seamlessly.
        // Also, note that this is the only transform that applies on
        // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
        // transforms work on effective width of xsize_.
        // Despite their names, 'out_stride' and 'in_stride' are the total
        // number of unpacked / packed elements for all the rows processed.
        const int out_stride = (row_end - row_start) * width;
        const int in_stride = (row_end - row_start) *
            VP8LSubSampleSize(transform->xsize_, transform->bits_);
        uint32_t* const src = out + out_stride - in_stride;
        // memmove (not memcpy): source and destination ranges may overlap.
        memmove(src, out, in_stride * sizeof(*src));
        ColorIndexInverseTransform(transform, row_start, row_end, src, out);
      } else {
        ColorIndexInverseTransform(transform, row_start, row_end, in, out);
      }
      break;
  }
 }
 //------------------------------------------------------------------------------
 // Color space conversion.
 // Returns non-zero when the first byte in memory of a 16-bit integer holding
 // the value 1 is not 1, i.e. when the host stores the most significant byte
 // first, and 0 otherwise.
 // The byte is fetched with memcpy() instead of reading the inactive member of
 // a union, which is undefined behaviour in C++.
 static int is_big_endian(void) {
  const uint16_t probe = 1;
  uint8_t first_byte;
  memcpy(&first_byte, &probe, sizeof(first_byte));
  return (first_byte != 1);
 }
 // Converts 'num_pixels' 32-bit pixels (alpha in bits 24..31, red in 16..23,
 // green in 8..15, blue in 0..7) to 3 bytes per pixel written to 'dst' in
 // R, G, B order. Alpha is dropped.
 void VP8LConvertBGRAToRGB_C(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i, dst += 3) {
    const uint32_t argb = src[i];
    dst[0] = (uint8_t)((argb >> 16) & 0xff);
    dst[1] = (uint8_t)((argb >> 8) & 0xff);
    dst[2] = (uint8_t)(argb & 0xff);
  }
 }
 // Converts 'num_pixels' 32-bit pixels (alpha in bits 24..31, red in 16..23,
 // green in 8..15, blue in 0..7) to 4 bytes per pixel written to 'dst' in
 // R, G, B, A order.
 void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i, dst += 4) {
    const uint32_t argb = src[i];
    dst[0] = (uint8_t)((argb >> 16) & 0xff);
    dst[1] = (uint8_t)((argb >> 8) & 0xff);
    dst[2] = (uint8_t)(argb & 0xff);
    dst[3] = (uint8_t)((argb >> 24) & 0xff);
  }
 }
 // Converts 'num_pixels' 32-bit pixels (alpha in bits 24..31, red in 16..23,
 // green in 8..15, blue in 0..7) to 2 bytes per pixel, keeping the top 4 bits
 // of every channel: one byte holds red (high nibble) and green (low nibble),
 // the other blue (high nibble) and alpha (low nibble). The red/green byte is
 // written first unless WEBP_SWAP_16BIT_CSP is defined.
 void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
                                  int num_pixels, uint8_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i, dst += 2) {
    const uint32_t argb = src[i];
    const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
    const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
 #ifdef WEBP_SWAP_16BIT_CSP
    dst[0] = ba;
    dst[1] = rg;
 #else
    dst[0] = rg;
    dst[1] = ba;
 #endif
  }
 }
 // Converts 'num_pixels' 32-bit pixels (red in bits 16..23, green in 8..15,
 // blue in 0..7) to 2 bytes per pixel holding 5 bits of red, 6 of green and 5
 // of blue: one byte carries red and the top 3 green bits, the other the next
 // 3 green bits and blue. The red byte is written first unless
 // WEBP_SWAP_16BIT_CSP is defined. Alpha is dropped.
 void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
                                int num_pixels, uint8_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i, dst += 2) {
    const uint32_t argb = src[i];
    const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
    const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
 #ifdef WEBP_SWAP_16BIT_CSP
    dst[0] = gb;
    dst[1] = rg;
 #else
    dst[0] = rg;
    dst[1] = gb;
 #endif
  }
 }
 // Converts 'num_pixels' 32-bit pixels (red in bits 16..23, green in 8..15,
 // blue in 0..7) to 3 bytes per pixel written to 'dst' in B, G, R order.
 // Alpha is dropped.
 void VP8LConvertBGRAToBGR_C(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i, dst += 3) {
    const uint32_t argb = src[i];
    dst[0] = (uint8_t)(argb & 0xff);
    dst[1] = (uint8_t)((argb >> 8) & 0xff);
    dst[2] = (uint8_t)((argb >> 16) & 0xff);
  }
 }
 // Writes the 'num_pixels' 32-bit values of 'src' to the byte buffer 'dst'.
 // When is_big_endian() equals 'swap_on_big_endian' every value goes through
 // the per-pixel path below (byte-swapped with BSwap32(), or stored with the
 // explicit byte writes selected by WORDS_BIGENDIAN /
 // WEBP_REFERENCE_IMPLEMENTATION); otherwise the buffer is copied verbatim.
 static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
                       int swap_on_big_endian) {
  if (is_big_endian() == swap_on_big_endian) {
    const uint32_t* const src_end = src + num_pixels;
    while (src < src_end) {
      const uint32_t argb = *src++;
 #if !defined(WORDS_BIGENDIAN)
 #if !defined(WEBP_REFERENCE_IMPLEMENTATION)
      // 'dst' is a byte pointer with no alignment guarantee, so the swapped
      // value is stored with memcpy() instead of through a uint32_t* cast
      // (which would be a potentially misaligned, type-punned store).
      const uint32_t swapped = BSwap32(argb);
      memcpy(dst, &swapped, sizeof(swapped));
 #else  // WEBP_REFERENCE_IMPLEMENTATION
      dst[0] = (argb >> 24) & 0xff;
      dst[1] = (argb >> 16) & 0xff;
      dst[2] = (argb >>  8) & 0xff;
      dst[3] = (argb >>  0) & 0xff;
 #endif
 #else  // WORDS_BIGENDIAN
      dst[0] = (argb >>  0) & 0xff;
      dst[1] = (argb >>  8) & 0xff;
      dst[2] = (argb >> 16) & 0xff;
      dst[3] = (argb >> 24) & 0xff;
 #endif
      dst += sizeof(argb);
    }
  } else {
    memcpy(dst, src, num_pixels * sizeof(*src));
  }
 }
 // Converts 'num_pixels' 32-bit pixels from 'in_data' into the byte layout
 // requested by 'out_colorspace' and writes them to 'rgba'. Each premultiplied
 // mode shares the conversion of its plain counterpart and then runs
 // WebPApplyAlphaMultiply() (WebPApplyAlphaMultiply4444() for the 4444 mode)
 // on the output. Any other mode value triggers assert(0).
 void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
                          WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
  switch (out_colorspace) {
    // Modes without alpha.
    case MODE_RGB:
      VP8LConvertBGRAToRGB(in_data, num_pixels, rgba);
      break;
    case MODE_BGR:
      VP8LConvertBGRAToBGR(in_data, num_pixels, rgba);
      break;
    case MODE_RGB_565:
      VP8LConvertBGRAToRGB565(in_data, num_pixels, rgba);
      break;
    // Modes with alpha, plain and premultiplied.
    case MODE_RGBA:
    case MODE_rgbA:
      VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
      if (out_colorspace == MODE_rgbA) {
        WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
      }
      break;
    case MODE_BGRA:
    case MODE_bgrA:
      CopyOrSwap(in_data, num_pixels, rgba, 1);
      if (out_colorspace == MODE_bgrA) {
        WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
      }
      break;
    case MODE_ARGB:
    case MODE_Argb:
      CopyOrSwap(in_data, num_pixels, rgba, 0);
      if (out_colorspace == MODE_Argb) {
        WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
      }
      break;
    case MODE_RGBA_4444:
    case MODE_rgbA_4444:
      VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
      if (out_colorspace == MODE_rgbA_4444) {
        WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
      }
      break;
    default:
      assert(0);          // Code flow should not reach here.
  }
 }
 //------------------------------------------------------------------------------
 // Function pointers through which the rest of this file reaches the lossless
 // DSP routines. They are assigned by VP8LDspInit().
 VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
 VP8LPredictorFunc VP8LPredictors[16];
 VP8LTransformColorFunc VP8LTransformColorInverse;
 VP8LConvertFunc VP8LConvertBGRAToRGB;
 VP8LConvertFunc VP8LConvertBGRAToRGBA;
 VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
 VP8LConvertFunc VP8LConvertBGRAToRGB565;
 VP8LConvertFunc VP8LConvertBGRAToBGR;
 VP8LMapARGBFunc VP8LMapColor32b;
 VP8LMapAlphaFunc VP8LMapColor8b;
 // Platform-specific initializers, only called when the matching WEBP_USE_*
 // macro is defined (see below).
 extern void VP8LDspInitSSE2(void);
 extern void VP8LDspInitNEON(void);
 extern void VP8LDspInitMIPSdspR2(void);
 // Remembers the VP8GetCPUInfo value seen by the last completed VP8LDspInit()
 // call. It starts out as its own address.
 // NOTE(review): presumably chosen so that it cannot compare equal to any real
 // VP8GetCPUInfo value before the first call -- confirm.
 static volatile VP8CPUInfo lossless_last_cpuinfo_used =
    (VP8CPUInfo)&lossless_last_cpuinfo_used;
 // Points every lossless DSP function pointer at the C implementation from
 // this file, then lets the SSE2 / NEON / MIPS dspR2 initializers run when
 // they are compiled in and VP8GetCPUInfo reports the feature. Returns
 // immediately if VP8GetCPUInfo is unchanged since the previous call.
 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
  if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
  VP8LPredictors[0] = Predictor0;
  VP8LPredictors[1] = Predictor1;
  VP8LPredictors[2] = Predictor2;
  VP8LPredictors[3] = Predictor3;
  VP8LPredictors[4] = Predictor4;
  VP8LPredictors[5] = Predictor5;
  VP8LPredictors[6] = Predictor6;
  VP8LPredictors[7] = Predictor7;
  VP8LPredictors[8] = Predictor8;
  VP8LPredictors[9] = Predictor9;
  VP8LPredictors[10] = Predictor10;
  VP8LPredictors[11] = Predictor11;
  VP8LPredictors[12] = Predictor12;
  VP8LPredictors[13] = Predictor13;
  // The table is indexed with a 4-bit value, so slots 14 and 15 must hold a
  // valid function even though only 14 predictors are defined in this file.
  VP8LPredictors[14] = Predictor0;     // <- padding security sentinels
  VP8LPredictors[15] = Predictor0;
  VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
  VP8LTransformColorInverse = VP8LTransformColorInverse_C;
  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
  VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
  VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
  VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
  VP8LMapColor32b = MapARGB;
  VP8LMapColor8b = MapAlpha;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8LDspInitSSE2();
    }
 #endif
 #if defined(WEBP_USE_NEON)
    if (VP8GetCPUInfo(kNEON)) {
      VP8LDspInitNEON();
    }
 #endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8LDspInitMIPSdspR2();
    }
 #endif
  }
  // Record which VP8GetCPUInfo the pointers were set up for.
  lossless_last_cpuinfo_used = VP8GetCPUInfo;
 }
 //------------------------------------------------------------------------------
--- a/src/loaders/webp/dsp/lossless.h
+++ b/src/loaders/webp/dsp/lossless.h
@ -0,0 +1,303 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Image transforms and color space conversion methods for lossless decoder.
 //
 // Authors: Vikas Arora (vikaas.arora@gmail.com)
 //          Jyrki Alakuijala (jyrki@google.com)
 #ifndef WEBP_DSP_LOSSLESS_H_
 #define WEBP_DSP_LOSSLESS_H_
 #include "../webp/types.h"
 #include "../webp/decode.h"
 // #include "../enc/histogram.h"
 #include "../utils/utils.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Not a trivial literal symbol.
 #define VP8L_NON_TRIVIAL_SYM (0xffffffff)
 //------------------------------------------------------------------------------
 // Decoding
 // Signature of a predictor: receives the 'left' pixel and a 'top' pointer and
 // returns a 32-bit value.
 typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
 // Predictor dispatch table with 16 slots.
 extern VP8LPredictorFunc VP8LPredictors[16];
 // Signature of a routine that works on 'num_pixels' entries of 'argb_data'.
 typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
 // Decoder hook; going by its name it adds green to the blue and red channels.
 extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
 // The three multipliers handed to a VP8LTransformColorFunc.
 typedef struct {
  // Note: the members are uint8_t, so that any negative values are
  // automatically converted to "mod 256" values.
  uint8_t green_to_red_;
  uint8_t green_to_blue_;
  uint8_t red_to_blue_;
 } VP8LMultipliers;
 // Signature of a color-transform routine: applies the multipliers 'm' to
 // 'num_pixels' entries of 'argb_data' (presumably in place, as the buffer is
 // the only non-const argument).
 typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
                                       uint32_t* argb_data, int num_pixels);
 // Hook for the inverse (decoder-side) color transform.
 extern VP8LTransformColorFunc VP8LTransformColorInverse;
 struct VP8LTransform;  // Defined in dec/vp8li.h.
 // Performs inverse transform of data given transform information, start and end
 // rows. Transform will be applied to the half-open row range
 // [row_start, row_end).
 // The *in and *out pointers refer to source and destination data respectively
 // corresponding to the intermediate row (row_start).
 void VP8LInverseTransform(const struct VP8LTransform* const transform,
                          int row_start, int row_end,
                          const uint32_t* const in, uint32_t* const out);
 // Color space conversion.
 // Signature of a converter: reads 'num_pixels' 32-bit pixels from 'src' and
 // writes the converted bytes to 'dst'.
 typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
                                uint8_t* dst);
 // One hook per output format; the target format is the suffix of the name.
 extern VP8LConvertFunc VP8LConvertBGRAToRGB;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
 extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
 extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
 extern VP8LConvertFunc VP8LConvertBGRAToBGR;
 // Converts 'num_pixels' pixels from BGRA ('in_data') to the color space
 // selected by 'out_colorspace', writing the result to 'rgba'.
 void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
 // color mapping related functions.
 // Returns bits 8..15 of 'idx', i.e. the green byte of an ARGB word.
 static WEBP_INLINE uint32_t VP8GetARGBIndex(uint32_t idx) {
  const uint32_t shifted = idx >> 8;
  return shifted & 0xff;
 }
 // Identity mapping: an 8-bit sample is used as the index unchanged.
 static WEBP_INLINE uint8_t VP8GetAlphaIndex(uint8_t idx) {
  const uint8_t index = idx;
  return index;
 }
 // Identity mapping: a 32-bit entry is returned unchanged.
 static WEBP_INLINE uint32_t VP8GetARGBValue(uint32_t val) {
  const uint32_t argb = val;
  return argb;
 }
 // Returns bits 8..15 of a 32-bit entry as an 8-bit value.
 static WEBP_INLINE uint8_t VP8GetAlphaValue(uint32_t val) {
  const uint32_t shifted = val >> 8;
  return shifted & 0xff;
 }
 // Signature of a color-map routine for 32-bit pixels: maps 'src' through
 // 'color_map' into 'dst' for an image 'width' pixels wide (presumably over
 // rows y_start..y_end-1 -- confirm against the implementation).
 typedef void (*VP8LMapARGBFunc)(const uint32_t* src,
                                const uint32_t* const color_map,
                                uint32_t* dst, int y_start,
                                int y_end, int width);
 // Same as VP8LMapARGBFunc, but 'src' and 'dst' hold 8-bit samples.
 typedef void (*VP8LMapAlphaFunc)(const uint8_t* src,
                                 const uint32_t* const color_map,
                                 uint8_t* dst, int y_start,
                                 int y_end, int width);
 // Color-map hooks for 32-bit and 8-bit data respectively.
 extern VP8LMapARGBFunc VP8LMapColor32b;
 extern VP8LMapAlphaFunc VP8LMapColor8b;
 // Similar to the static method ColorIndexInverseTransform() that is part of
 // lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
 // uint32_t) arguments for 'src' and 'dst'.
 void VP8LColorIndexInverseTransformAlpha(
    const struct VP8LTransform* const transform, int y_start, int y_end,
    const uint8_t* src, uint8_t* dst);
 // Expose some C-only fallback functions. Their signatures match the
 // VP8LTransformColorFunc, VP8LConvertFunc and VP8LProcessBlueAndRedFunc hook
 // types declared in this header.
 void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
                                 uint32_t* data, int num_pixels);
 void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
 void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
 void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
                                 int num_pixels, uint8_t* dst);
 void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
                               int num_pixels, uint8_t* dst);
 void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
 void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
 // Must be called before calling any of the above methods.
 void VP8LDspInit(void);
 //------------------------------------------------------------------------------
 // Encoding
 // NOTE(review): this loader is described as decoding-only, so the encoder
 // symbols declared below may have no definition in this tree -- confirm before
 // referencing any of them.
 extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
 extern VP8LTransformColorFunc VP8LTransformColor;
 // Signature of a routine that fills 'histo' from a tile_width x tile_height
 // tile of 'argb' (rows 'stride' apart) for the given blue multipliers.
 typedef void (*VP8LCollectColorBlueTransformsFunc)(
    const uint32_t* argb, int stride,
    int tile_width, int tile_height,
    int green_to_blue, int red_to_blue, int histo[]);
 extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
 // Same as above for the red channel, with a single green_to_red multiplier.
 typedef void (*VP8LCollectColorRedTransformsFunc)(
    const uint32_t* argb, int stride,
    int tile_width, int tile_height,
    int green_to_red, int histo[]);
 extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
 // Expose some C-only fallback functions
 void VP8LTransformColor_C(const VP8LMultipliers* const m,
                          uint32_t* data, int num_pixels);
 void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
 //------------------------------------------------------------------------------
 // Image transforms.
 void VP8LResidualImage(int width, int height, int bits, int low_effort,
                       uint32_t* const argb, uint32_t* const argb_scratch,
                       uint32_t* const image);
 void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
                             uint32_t* const argb, uint32_t* image);
 //------------------------------------------------------------------------------
 // Misc methods.
 // Returns 'size' divided by 2^sampling_bits, rounded up: the number of
 // samples needed to cover 'size' when each sample spans 2^sampling_bits units.
 static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
                                              uint32_t sampling_bits) {
  const uint32_t bumped = size + (1 << sampling_bits);
  return (bumped - 1) >> sampling_bits;
 }
 // -----------------------------------------------------------------------------
 // Faster logarithm for integers. Small values use a look-up table.
 // Number of entries in the two look-up tables below; inputs at or above this
 // value are routed to the *Slow hooks by VP8LFastLog2() / VP8LFastSLog2().
 #define LOG_LOOKUP_IDX_MAX 256
 extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
 extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
 // Signature and hooks of the fallbacks used for values outside the tables.
 typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
 extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
 extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
 // Returns kLog2Table[v] for v below LOG_LOOKUP_IDX_MAX and defers to the
 // VP8LFastLog2Slow hook for anything larger.
 static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
  if (v >= LOG_LOOKUP_IDX_MAX) return VP8LFastLog2Slow(v);
  return kLog2Table[v];
 }
 // Fast v * log2(v) for integer input: table look-up (kSLog2Table) for v below
 // LOG_LOOKUP_IDX_MAX, the VP8LFastSLog2Slow hook otherwise.
 static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
  if (v >= LOG_LOOKUP_IDX_MAX) return VP8LFastSLog2Slow(v);
  return kSLog2Table[v];
 }
 // -----------------------------------------------------------------------------
 // Huffman-cost related functions.
 // Cost of one histogram ('population', 'length' entries) and of two histograms
 // of the same length taken together.
 typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
 typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
                                       int length);
 extern VP8LCostFunc VP8LExtraCost;
 extern VP8LCostCombinedFunc VP8LExtraCostCombined;
 typedef struct {        // small struct to hold counters
  int counts[2];        // index: 0=zero streak, 1=non-zero streak
  int streaks[2][2];    // [zero/non-zero][streak<3 / streak>=3]
 } VP8LStreaks;
 // Streak-counting variants of the cost callbacks; they return the counters by
 // value in a VP8LStreaks.
 typedef VP8LStreaks (*VP8LCostCountFunc)(const uint32_t* population,
                                         int length);
 typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X,
                                                 const uint32_t* Y, int length);
 extern VP8LCostCountFunc VP8LHuffmanCostCount;
 extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
 // Get the symbol entropy for the distribution 'population'.
 // Set 'trivial_sym', if there's only one symbol present in the distribution.
 double VP8LPopulationCost(const uint32_t* const population, int length,
                          uint32_t* const trivial_sym);
 // Get the combined symbol entropy for the distributions 'X' and 'Y'.
 double VP8LGetCombinedEntropy(const uint32_t* const X,
                              const uint32_t* const Y, int length);
 // -----------------------------------------------------------------------------
 // PrefixEncode()
 // Returns BitsLog2Floor(n) when 'n' is zero or a power of two and
 // BitsLog2Floor(n) + 1 otherwise.
 static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
  const int floor_bits = BitsLog2Floor(n);
  // (n & (n - 1)) clears the lowest set bit; nothing is left exactly when
  // 'n' is zero or has a single bit set.
  const int single_bit_or_zero = ((n & (n - 1)) == 0);
  return single_bit_or_zero ? floor_bits : floor_bits + 1;
 }
 // Splits a distance or length value into a prefix code and a count of extra
 // bits without using the look-up tables. The prefix is meant to be entropy
 // coded while the extra bits are stored as plain bits.
 //   distance   - value to encode; the computation works on 'distance - 1'.
 //   code       - receives 2 * (highest set bit) + (the bit just below it).
 //   extra_bits - receives (highest set bit) - 1.
 static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
                                                  int* const extra_bits) {
  const int value = distance - 1;
  const int top_bit = BitsLog2Floor(value);
  const int below_top = top_bit - 1;
  *extra_bits = below_top;
  *code = 2 * top_bit + ((value >> below_top) & 1);
 }
 // Same split as VP8LPrefixEncodeBitsNoLUT(), additionally returning the value
 // of the extra bits: the low '*extra_bits' bits of 'distance - 1'.
 static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
                                              int* const extra_bits,
                                              int* const extra_bits_value) {
  const int value = distance - 1;
  const int top_bit = BitsLog2Floor(value);
  const int below_top = top_bit - 1;
  *extra_bits = below_top;
  *extra_bits_value = value & ((1 << below_top) - 1);
  *code = 2 * top_bit + ((value >> below_top) & 1);
 }
 // Number of entries in the prefix look-up tables; distances at or above this
 // value are handled by the *NoLUT functions.
 #define PREFIX_LOOKUP_IDX_MAX   512
 // One look-up entry: prefix code and number of extra bits for a distance.
 typedef struct {
  int8_t code_;
  int8_t extra_bits_;
 } VP8LPrefixCode;
 // These tables are derived using VP8LPrefixEncodeNoLUT.
 extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
 extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
 // Table-accelerated form of VP8LPrefixEncodeBitsNoLUT(): distances below
 // PREFIX_LOOKUP_IDX_MAX are answered from kPrefixEncodeCode, larger ones are
 // computed.
 static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
                                             int* const extra_bits) {
  if (distance >= PREFIX_LOOKUP_IDX_MAX) {
    VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
    return;
  }
  {
    const VP8LPrefixCode* const entry = &kPrefixEncodeCode[distance];
    *code = entry->code_;
    *extra_bits = entry->extra_bits_;
  }
 }
 // Table-accelerated form of VP8LPrefixEncodeNoLUT(): distances below
 // PREFIX_LOOKUP_IDX_MAX are answered from kPrefixEncodeCode and
 // kPrefixEncodeExtraBitsValue, larger ones are computed.
 static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
                                         int* const extra_bits,
                                         int* const extra_bits_value) {
  if (distance >= PREFIX_LOOKUP_IDX_MAX) {
    VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
    return;
  }
  {
    const VP8LPrefixCode* const entry = &kPrefixEncodeCode[distance];
    *code = entry->code_;
    *extra_bits = entry->extra_bits_;
    *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
  }
 }
 // Per-channel difference a - b, each 8-bit component wrapping mod 256.
 // The four channels are handled as two interleaved pairs; the bias constant
 // sets every bit of the pair's unused bytes, so a borrow out of one channel is
 // taken from that padding and never reaches the next channel.
 static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
  const uint32_t kAlphaGreen = 0xff00ff00u;
  const uint32_t kRedBlue = 0x00ff00ffu;
  const uint32_t ag_diff = kRedBlue + (a & kAlphaGreen) - (b & kAlphaGreen);
  const uint32_t rb_diff = kAlphaGreen + (a & kRedBlue) - (b & kRedBlue);
  return (ag_diff & kAlphaGreen) | (rb_diff & kRedBlue);
 }
 // NOTE(review): judging by the signature this packs 'width' 8-bit entries of
 // 'row' into 32-bit words in 'dst' according to 'xbits' -- confirm against
 // the implementation.
 void VP8LBundleColorMap(const uint8_t* const row, int width,
                        int xbits, uint32_t* const dst);
 // Must be called before calling any of the above methods.
 void VP8LEncDspInit(void);
 //------------------------------------------------------------------------------
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  // WEBP_DSP_LOSSLESS_H_
--- a/src/loaders/webp/dsp/meson.build
+++ b/src/loaders/webp/dsp/meson.build
@ -0,0 +1,20 @@
 # Headers and sources that make up the webp loader's dsp directory.
 source_file = [
   'yuv.h',
   'dsp.h',
   'lossless.h',
   'alpha_processing.cpp',
   'argb.cpp',
   'cpu.cpp',
   'dec.cpp',
   'dec_clip_tables.cpp',
   'filters.cpp',
   'lossless.cpp',
   'rescaler.cpp',
   'upsampling.cpp',
   'yuv.cpp'
 ]
 # Append this directory (include path + sources) to the webp_deb list.
 # NOTE(review): webp_deb is presumably created by the parent webp meson.build.
 webp_deb += [declare_dependency(
   include_directories : include_directories('.'),
   sources : source_file
 )]
--- a/src/loaders/webp/dsp/rescaler.cpp
+++ b/src/loaders/webp/dsp/rescaler.cpp
@ -0,0 +1,115 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Rescaling functions
 #include "./dsp.h"
 #include "../utils/rescaler.h"
 //------------------------------------------------------------------------------
 // Implementations of critical functions ImportRow / ExportRow
 // Fixed-point helpers. MULT_FIX(x, y) multiplies in 64 bits, adds half of the
 // final unit (ROUNDER) and then drops the WEBP_RESCALER_RFIX fractional bits.
 #define ROUNDER (1 << (WEBP_RESCALER_RFIX - 1))
 #define MULT_FIX(x, y) (((int64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
 // Horizontally rescales one source row for a single channel into wrk->frow and
 // then adds that result to wrk->irow.
 //   wrk     - rescaler state; reads num_channels, dst_width, x_expand, x_add,
 //             x_sub and fx_scale, writes frow[] and irow[].
 //   src     - source row; the samples of one channel are num_channels apart.
 //   channel - channel to process, also the offset of its first sample.
 static void RescalerImportRowC(WebPRescaler* const wrk,
                               const uint8_t* const src, int channel) {
  const int x_stride = wrk->num_channels;
  const int x_out_max = wrk->dst_width * wrk->num_channels;
  int x_in = channel;
  int x_out;
  int accum = 0;
  if (!wrk->x_expand) {
    // Not expanding: every output pixel sums up several source samples.
    int sum = 0;
    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
      accum += wrk->x_add;
      // Source samples that are added in full to this output pixel.
      for (; accum > 0; accum -= wrk->x_sub) {
        sum += src[x_in];
        x_in += x_stride;
      }
      {        // Emit next horizontal pixel.
        // The next source sample is shared: 'frac' (weight -accum) is the
        // part that is carried over to the following output pixel.
        const int32_t base = src[x_in];
        const int32_t frac = base * (-accum);
        x_in += x_stride;
        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
        // fresh fractional start for next pixel
        sum = (int)MULT_FIX(frac, wrk->fx_scale);
      }
    }
  } else {        // simple bilinear interpolation
    int left = src[channel], right = src[channel];
    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
      // Once 'accum' goes negative, move on to the next pair of samples.
      if (accum < 0) {
        left = right;
        x_in += x_stride;
        right = src[x_in];
        accum += wrk->x_add;
      }
      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
      accum -= wrk->x_sub;
    }
  }
  // Accumulate the contribution of the new row.
  for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
    wrk->irow[x_out] += wrk->frow[x_out];
  }
 }
 // Writes one output row to wrk->dst, starting at element 'x_out', but only if
 // wrk->y_accum is zero or negative; otherwise the call has no effect.
 // Each value is the accumulated row (irow) minus the share of the latest row
 // (frow) that is carried over, scaled by fxy_scale and clamped to 0..255.
 // The carried share becomes the new content of irow. Afterwards y_accum is
 // advanced by y_add and dst by dst_stride.
 void WebPRescalerExportRowC(WebPRescaler* const wrk, int x_out) {
  if (wrk->y_accum > 0) return;
  {
    uint8_t* const out_row = wrk->dst;
    int32_t* const acc_row = wrk->irow;
    const int32_t* const last_row = wrk->frow;
    const int row_scale = wrk->fy_scale * (-wrk->y_accum);
    const int end = wrk->dst_width * wrk->num_channels;
    int pos;
    for (pos = x_out; pos < end; ++pos) {
      const int carry = (int)MULT_FIX(last_row[pos], row_scale);
      const int value = (int)MULT_FIX(acc_row[pos] - carry, wrk->fxy_scale);
      if (value < 0) {
        out_row[pos] = 0;
      } else if (value > 255) {
        out_row[pos] = 255;
      } else {
        out_row[pos] = value;
      }
      acc_row[pos] = carry;   // start of the next accumulation
    }
  }
  wrk->y_accum += wrk->y_add;
  wrk->dst += wrk->dst_stride;
 }
 #undef MULT_FIX
 #undef ROUNDER
 //------------------------------------------------------------------------------
 // Row import / export hooks; WebPRescalerDspInit() assigns them.
 void (*WebPRescalerImportRow)(struct WebPRescaler* const wrk,
                              const uint8_t* const src, int channel);
 void (*WebPRescalerExportRow)(struct WebPRescaler* const wrk, int x_out);
 // MIPS initializers, called only when the matching WEBP_USE_* macro is set.
 extern void WebPRescalerDspInitMIPS32(void);
 extern void WebPRescalerDspInitMIPSdspR2(void);
 // VP8GetCPUInfo value recorded by the last WebPRescalerDspInit() run; starts
 // out holding its own address so that the first call is not skipped.
 static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
    (VP8CPUInfo)&rescaler_last_cpuinfo_used;
 // Selects the rescaler row import/export routines: the C versions first, then
 // any MIPS override that is compiled in and reported by VP8GetCPUInfo. Does
 // nothing when VP8GetCPUInfo has not changed since the previous run.
 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
  if (rescaler_last_cpuinfo_used != VP8GetCPUInfo) {
    WebPRescalerExportRow = WebPRescalerExportRowC;
    WebPRescalerImportRow = RescalerImportRowC;
    if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_MIPS32)
      if (VP8GetCPUInfo(kMIPS32)) WebPRescalerDspInitMIPS32();
 #endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
      if (VP8GetCPUInfo(kMIPSdspR2)) WebPRescalerDspInitMIPSdspR2();
 #endif
    }
    // Remember which callback this setup was made for.
    rescaler_last_cpuinfo_used = VP8GetCPUInfo;
  }
 }
--- a/src/loaders/webp/dsp/upsampling.cpp
+++ b/src/loaders/webp/dsp/upsampling.cpp
@ -0,0 +1,252 @@
 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // YUV to RGB upsampling functions.
 //
 // Author: somnath@google.com (Somnath Banerjee)
 #include "./dsp.h"
 #include "./yuv.h"
 #include <assert.h>
 //------------------------------------------------------------------------------
 // Fancy upsampler
 #ifdef FANCY_UPSAMPLING
 // Fancy upsampling functions to convert YUV to RGB, one slot per output mode
 // (indexed by the MODE_* values); filled in by WebPInitUpsamplers().
 WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
 // Given samples laid out in a square as:
 //  [a b]
 //  [c d]
 // we interpolate u/v as:
 //  ([9*a + 3*b + 3*c +   d    3*a + 9*b +   c + 3*d] + [8 8]) / 16
 //  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d] + [8 8]) / 16
 // We process u and v together stashed into 32bit (16bit each).
 // Packs a U and a V sample into one 32-bit word: U in the low 16 bits, V in
 // the high 16 bits.
 #define LOAD_UV(u, v) ((u) | ((v) << 16))
 // Defines FUNC_NAME(), which converts 'len' pixels of one or two luma rows
 // with the per-pixel converter FUNC, interpolating chroma between the
 // top_u/top_v and cur_u/cur_v rows. 'top_y' must not be NULL (asserted); the
 // bottom row is skipped when 'bottom_y' is NULL. XSTEP is the number of bytes
 // written per output pixel.
 #define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
                      const uint8_t* top_u, const uint8_t* top_v,              \
                      const uint8_t* cur_u, const uint8_t* cur_v,              \
                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
  int x;                                                                       \
  const int last_pixel_pair = (len - 1) >> 1;                                  \
  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
  assert(top_y != NULL);                                                       \
  {                                                                            \
    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
  }                                                                            \
  if (bottom_y != NULL) {                                                      \
    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
  }                                                                            \
  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
    /* precompute invariant values associated with first and second diagonals*/\
    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
    {                                                                          \
      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
           top_dst + (2 * x - 1) * XSTEP);                                     \
      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
           top_dst + (2 * x - 0) * XSTEP);                                     \
    }                                                                          \
    if (bottom_y != NULL) {                                                    \
      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
           bottom_dst + (2 * x - 1) * XSTEP);                                  \
      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
           bottom_dst + (2 * x + 0) * XSTEP);                                  \
    }                                                                          \
    tl_uv = t_uv;                                                              \
    l_uv = uv;                                                                 \
  }                                                                            \
  if (!(len & 1)) {                                                            \
    {                                                                          \
      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
           top_dst + (len - 1) * XSTEP);                                       \
    }                                                                          \
    if (bottom_y != NULL) {                                                    \
      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
           bottom_dst + (len - 1) * XSTEP);                                    \
    }                                                                          \
  }                                                                            \
 }
 // All variants implemented. The last argument is the number of bytes each
 // output pixel occupies in the destination row.
 UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
 UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
 UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
 UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
 UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
 UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
 UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
 #undef LOAD_UV
 #undef UPSAMPLE_FUNC
 #endif  // FANCY_UPSAMPLING
 //------------------------------------------------------------------------------
 #if !defined(FANCY_UPSAMPLING)
 // Non-fancy replacement: defines FUNC_NAME(), which converts 'len' pixels of
 // the top row and, when 'bot_dst' is not NULL, of the bottom row with FUNC.
 // Two horizontally adjacent luma samples share one u/v sample and no
 // interpolation is done. 'top_dst' must not be NULL (asserted). The output
 // offsets (8 * x and + 4) are hard-coded for 4 bytes per pixel.
 #define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC)                                      \
 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
                      const uint8_t* top_u, const uint8_t* top_v,              \
                      const uint8_t* bot_u, const uint8_t* bot_v,              \
                      uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
  const int half_len = len >> 1;                                               \
  int x;                                                                       \
  assert(top_dst != NULL);                                                     \
  {                                                                            \
    for (x = 0; x < half_len; ++x) {                                           \
      FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0);         \
      FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4);         \
    }                                                                          \
    if (len & 1) FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x);  \
  }                                                                            \
  if (bot_dst != NULL) {                                                       \
    for (x = 0; x < half_len; ++x) {                                           \
      FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x + 0);         \
      FUNC(bot_y[2 * x + 1], bot_u[x], bot_v[x], bot_dst + 8 * x + 4);         \
    }                                                                          \
    if (len & 1) FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x);  \
  }                                                                            \
 }
 // Only the two 4-byte-per-pixel variants handed out by
 // WebPGetLinePairConverter() are generated.
 DUAL_SAMPLE_FUNC(DualLineSamplerBGRA, VP8YuvToBgra)
 DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
 #undef DUAL_SAMPLE_FUNC
 #endif  // !FANCY_UPSAMPLING
 // Returns the line-pair converter for 4-byte output: the BGRA variant when
 // 'alpha_is_last' is non-zero, the ARGB variant otherwise. Runs
 // WebPInitUpsamplers() and VP8YUVInit() first, so the returned function is
 // ready for use.
 WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
  WebPInitUpsamplers();
  VP8YUVInit();
 #ifdef FANCY_UPSAMPLING
  if (alpha_is_last) return WebPUpsamplers[MODE_BGRA];
  return WebPUpsamplers[MODE_ARGB];
 #else
  if (alpha_is_last) return DualLineSamplerBGRA;
  return DualLineSamplerARGB;
 #endif
 }
 //------------------------------------------------------------------------------
 // YUV444 converter
 // Defines FUNC_NAME(), which converts 'len' pixels with FUNC using one u and
 // one v sample per luma sample. XSTEP is the number of bytes written per
 // output pixel.
 #define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
 static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
                      uint8_t* dst, int len) {                                 \
  int i;                                                                       \
  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
 }
 YUV444_FUNC(Yuv444ToRgb,      VP8YuvToRgb,  3)
 YUV444_FUNC(Yuv444ToBgr,      VP8YuvToBgr,  3)
 YUV444_FUNC(Yuv444ToRgba,     VP8YuvToRgba, 4)
 YUV444_FUNC(Yuv444ToBgra,     VP8YuvToBgra, 4)
 YUV444_FUNC(Yuv444ToArgb,     VP8YuvToArgb, 4)
 YUV444_FUNC(Yuv444ToRgba4444, VP8YuvToRgba4444, 2)
 YUV444_FUNC(Yuv444ToRgb565,   VP8YuvToRgb565, 2)
 #undef YUV444_FUNC
 // YUV444 converters, one slot per output mode (indexed by the MODE_* values);
 // filled in by WebPInitYUV444Converters().
 WebPYUV444Converter WebPYUV444Converters[MODE_LAST];
 extern void WebPInitYUV444ConvertersMIPSdspR2(void);
 // VP8GetCPUInfo value recorded by the last WebPInitYUV444Converters() run;
 // starts out holding its own address so that the first call is not skipped.
 static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
    (VP8CPUInfo)&upsampling_last_cpuinfo_used1;
 // Fills WebPYUV444Converters with the C converters and then lets the MIPS
 // DSP R2 back-end override them when it is compiled in and reported by
 // VP8GetCPUInfo. Does nothing when VP8GetCPUInfo has not changed since the
 // previous run.
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
  if (upsampling_last_cpuinfo_used1 != VP8GetCPUInfo) {
    // Modes without alpha.
    WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb;
    WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr;
    WebPYUV444Converters[MODE_RGB_565]   = Yuv444ToRgb565;
    // Modes with alpha; each lower-case (rgbA, bgrA, Argb, rgbA_4444) mode
    // uses the same converter as its upper-case counterpart.
    WebPYUV444Converters[MODE_RGBA]      = Yuv444ToRgba;
    WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba;
    WebPYUV444Converters[MODE_BGRA]      = Yuv444ToBgra;
    WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra;
    WebPYUV444Converters[MODE_ARGB]      = Yuv444ToArgb;
    WebPYUV444Converters[MODE_Argb]      = Yuv444ToArgb;
    WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
    WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
    if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_MIPS_DSP_R2)
      if (VP8GetCPUInfo(kMIPSdspR2)) WebPInitYUV444ConvertersMIPSdspR2();
 #endif
    }
    // Remember which callback this setup was made for.
    upsampling_last_cpuinfo_used1 = VP8GetCPUInfo;
  }
 }
 //------------------------------------------------------------------------------
 // Main calls
 // SIMD back-end initializers. WebPInitUpsamplers() calls each one only when
 // its WEBP_USE_* macro is defined and VP8GetCPUInfo reports the feature.
 extern void WebPInitUpsamplersSSE2(void);
 extern void WebPInitUpsamplersNEON(void);
 extern void WebPInitUpsamplersMIPSdspR2(void);
 // VP8GetCPUInfo value recorded by the last WebPInitUpsamplers() run; starts
 // out holding its own address so that the first call is not skipped.
 static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
    (VP8CPUInfo)&upsampling_last_cpuinfo_used2;
 // With FANCY_UPSAMPLING defined, fills WebPUpsamplers with the C line-pair
 // upsamplers and lets the compiled-in SIMD back-ends override them. Without
 // it, only the VP8GetCPUInfo value is recorded. Does nothing when
 // VP8GetCPUInfo has not changed since the previous run.
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
  if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
 #ifdef FANCY_UPSAMPLING
  // Modes without alpha.
  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
  // Modes with alpha; each lower-case (rgbA, bgrA, Argb, rgbA_4444) mode uses
  // the same upsampler as its upper-case counterpart.
  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
  // With a CPU-info callback available, let faster versions take over.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) WebPInitUpsamplersSSE2();
 #endif
 #if defined(WEBP_USE_NEON)
    if (VP8GetCPUInfo(kNEON)) WebPInitUpsamplersNEON();
 #endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) WebPInitUpsamplersMIPSdspR2();
 #endif
  }
 #endif  // FANCY_UPSAMPLING
  // Remember which callback this setup was made for.
  upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
 }
 //------------------------------------------------------------------------------
--- a/src/loaders/webp/dsp/yuv.cpp
+++ b/src/loaders/webp/dsp/yuv.cpp
@ -0,0 +1,166 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // YUV->RGB conversion functions
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./yuv.h"
 // Table-driven YUV->RGB support. This whole branch is only compiled when
 // WEBP_YUV_USE_TABLE is defined; otherwise VP8YUVInit() is an empty stub.
 #if defined(WEBP_YUV_USE_TABLE)
 // Set to 1 once the lookup tables below have been filled.
 static int done = 0;
 // Clamps 'v' to [0, max_value]; the result is narrowed to uint8_t on return.
 static WEBP_INLINE uint8_t clip(int v, int max_value) {
  return v < 0 ? 0 : v > max_value ? max_value : v;
 }
 // Per-chroma-value offsets. VP8kVToR / VP8kUToB are already shifted down by
 // YUV_FIX; VP8kVToG / VP8kUToG keep full precision.
 int16_t VP8kVToR[256], VP8kUToB[256];
 int32_t VP8kVToG[256], VP8kUToG[256];
 // Clipping tables indexed by (value - YUV_RANGE_MIN).
 uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
 uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
 // Fills the conversion and clipping tables on the first call; later calls
 // return immediately because 'done' is set.
 WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {
  int i;
  if (done) {
    return;
  }
 #ifndef USE_YUVj
  for (i = 0; i < 256; ++i) {
    VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
    // Only VP8kUToG carries the YUV_HALF rounding term; VP8kVToG does not.
    VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
    VP8kVToG[i] = -45773 * (i - 128);
    VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX;
  }
  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
    // Applies the (i - 16) * 76283 >> YUV_FIX scaling before clipping.
    const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
  }
 #else
  for (i = 0; i < 256; ++i) {
    VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
    VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
    VP8kVToG[i] = -46802 * (i - 128);
    VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
  }
  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
    // USE_YUVj variant: the value is clipped as-is, without rescaling.
    const int k = i;
    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
  }
 #endif
  done = 1;
 }
 #else
 // No tables to build in the arithmetic (non-table) configuration.
 WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
 #endif  // WEBP_YUV_USE_TABLE
 //-----------------------------------------------------------------------------
 // Plain-C version
 // ROW_FUNC(FUNC_NAME, FUNC, XSTEP) defines a static row converter FUNC_NAME
 // that turns 'len' luma samples into output pixels of XSTEP bytes each by
 // calling FUNC(y, u, v, dst) per pixel. Two consecutive luma samples share
 // one u/v pair; an odd trailing pixel reuses the current u/v pair.
 // NOTE: the closing-brace line of the macro must NOT end with a backslash,
 // otherwise the line that follows the macro is spliced into its body.
 #define ROW_FUNC(FUNC_NAME, FUNC, XSTEP)                                       \
 static void FUNC_NAME(const uint8_t* y,                                        \
                       const uint8_t* u, const uint8_t* v,                      \
                       uint8_t* dst, int len) {                                 \
  const uint8_t* const end = dst + (len & ~1) * XSTEP;                         \
  while (dst != end) {                                                         \
    FUNC(y[0], u[0], v[0], dst);                                               \
    FUNC(y[1], u[0], v[0], dst + XSTEP);                                       \
    y += 2;                                                                    \
    ++u;                                                                       \
    ++v;                                                                       \
    dst += 2 * XSTEP;                                                          \
  }                                                                            \
  if (len & 1) {                                                               \
    FUNC(y[0], u[0], v[0], dst);                                               \
  }                                                                            \
 }
 // All variants implemented.
 ROW_FUNC(YuvToRgbRow,      VP8YuvToRgb,  3)
 ROW_FUNC(YuvToBgrRow,      VP8YuvToBgr,  3)
 ROW_FUNC(YuvToRgbaRow,     VP8YuvToRgba, 4)
 ROW_FUNC(YuvToBgraRow,     VP8YuvToBgra, 4)
 ROW_FUNC(YuvToArgbRow,     VP8YuvToArgb, 4)
 ROW_FUNC(YuvToRgba4444Row, VP8YuvToRgba4444, 2)
 ROW_FUNC(YuvToRgb565Row,   VP8YuvToRgb565, 2)
 #undef ROW_FUNC
 // Main call for processing a plane with a WebPSamplerRowFunc function:
 // Runs 'func' over every row of a plane. The luma and destination pointers
 // move forward by their stride after each row, while the u/v pointers move
 // forward by 'uv_stride' only after odd-numbered rows, so that two
 // consecutive rows are converted with the same chroma row.
 void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
                             const uint8_t* u, const uint8_t* v, int uv_stride,
                             uint8_t* dst, int dst_stride,
                             int width, int height, WebPSamplerRowFunc func) {
  int row = 0;
  while (row < height) {
    func(y, u, v, dst, width);
    if ((row & 1) != 0) {
      // Finished the second row of a pair: step to the next chroma row.
      u += uv_stride;
      v += uv_stride;
    }
    y += y_stride;
    dst += dst_stride;
    ++row;
  }
 }
 //-----------------------------------------------------------------------------
 // Main call
 // Row samplers, one slot per output colorspace mode; filled by
 // WebPInitSamplers().
 WebPSamplerRowFunc WebPSamplers[MODE_LAST];
 // Architecture-specific initializers, only called when the matching
 // WEBP_USE_* macro is defined and VP8GetCPUInfo() reports the feature.
 extern void WebPInitSamplersSSE2(void);
 extern void WebPInitSamplersMIPS32(void);
 extern void WebPInitSamplersMIPSdspR2(void);
 // Records the VP8GetCPUInfo value seen by the last WebPInitSamplers() call.
 // Seeded with its own address so that the first call always initializes.
 static volatile VP8CPUInfo yuv_last_cpuinfo_used =
    (VP8CPUInfo)&yuv_last_cpuinfo_used;
 // Installs the plain-C row samplers into WebPSamplers[] and then lets the
 // available SIMD back-ends replace entries. Returns immediately when
 // VP8GetCPUInfo has not changed since the previous call.
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
  if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
  WebPSamplers[MODE_RGB]       = YuvToRgbRow;
  WebPSamplers[MODE_RGBA]      = YuvToRgbaRow;
  WebPSamplers[MODE_BGR]       = YuvToBgrRow;
  WebPSamplers[MODE_BGRA]      = YuvToBgraRow;
  WebPSamplers[MODE_ARGB]      = YuvToArgbRow;
  WebPSamplers[MODE_RGBA_4444] = YuvToRgba4444Row;
  WebPSamplers[MODE_RGB_565]   = YuvToRgb565Row;
  // The lower-case modes below share the routines of their upper-case
  // counterparts.
  WebPSamplers[MODE_rgbA]      = YuvToRgbaRow;
  WebPSamplers[MODE_bgrA]      = YuvToBgraRow;
  WebPSamplers[MODE_Argb]      = YuvToArgbRow;
  WebPSamplers[MODE_rgbA_4444] = YuvToRgba4444Row;
  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
 #if defined(WEBP_USE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      WebPInitSamplersSSE2();
    }
 #endif  // WEBP_USE_SSE2
 #if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      WebPInitSamplersMIPS32();
    }
 #endif  // WEBP_USE_MIPS32
 #if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      WebPInitSamplersMIPSdspR2();
    }
 #endif  // WEBP_USE_MIPS_DSP_R2
  }
  // Remember which CPU-info hook this table was built for.
  yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }
 //-----------------------------------------------------------------------------
--- a/src/loaders/webp/dsp/yuv.h
+++ b/src/loaders/webp/dsp/yuv.h
@ -0,0 +1,321 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // inline YUV<->RGB conversion function
 //
 // The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
 // More information at: http://en.wikipedia.org/wiki/YCbCr
 // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
 // We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
 //
 // For the Y'CbCr to RGB conversion, the BT.601 specification reads:
 //   R = 1.164 * (Y-16) + 1.596 * (V-128)
 //   G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
 //   B = 1.164 * (Y-16)                   + 2.018 * (U-128)
 // where Y is in the [16,235] range, and U/V in the [16,240] range.
 // In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
 // "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
 // So in this case the formulae should read:
 //   R = 1.164 * [Y + 1.371 * (V-128)                  ] - 18.624
 //   G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
 //   B = 1.164 * [Y                   + 1.733 * (U-128)] - 18.624
 // once factorized.
 // For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2).
 // That's the maximum possible for a convenient ARM implementation.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_DSP_YUV_H_
 #define WEBP_DSP_YUV_H_
 #include "./dsp.h"
 #include "../dec/decode_vp8.h"
 // Define the following to use the LUT-based code:
 // #define WEBP_YUV_USE_TABLE
 #if defined(WEBP_EXPERIMENTAL_FEATURES)
 // Do NOT activate this feature for real compression. This is only experimental!
 // This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
 // This colorspace is close to Rec.601's Y'CbCr model with the notable
 // difference of allowing larger range for luma/chroma.
 // See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
 // difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
 // #define USE_YUVj
 #endif
 //------------------------------------------------------------------------------
 // YUV -> RGB conversion
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Fixed-point parameters shared by the conversion helpers in this header.
 // The *_FIX values are the number of fractional bits, *_HALF is the matching
 // rounding term (0.5 in fixed point) and *_MASK covers every bit of a value
 // that still fits in 8 bits once shifted down by *_FIX.
 enum {
  YUV_FIX = 16,                    // fixed-point precision for RGB->YUV
  YUV_HALF = 1 << (YUV_FIX - 1),
  YUV_MASK = (256 << YUV_FIX) - 1,
  YUV_RANGE_MIN = -227,            // min value of r/g/b output
  YUV_RANGE_MAX = 256 + 226,       // max value of r/g/b output
  YUV_FIX2 = 14,                   // fixed-point precision for YUV->RGB
  YUV_HALF2 = 1 << (YUV_FIX2 - 1),
  YUV_MASK2 = (256 << YUV_FIX2) - 1
 };
 // These constants are 14b fixed-point version of ITU-R BT.601 constants.
 #define kYScale 19077    // 1.164 = 255 / 219
 #define kVToR   26149    // 1.596 = 255 / 112 * 0.701
 #define kUToG   6419     // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
 #define kVToG   13320    // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
 #define kUToB   33050    // 2.018 = 255 / 112 * 0.886
 // Per-channel constants that fold the "-16" luma offset, the "-128" chroma
 // offsets and the YUV_HALF2 rounding term into a single addend, so that the
 // converters can work directly on the raw y/u/v sample values.
 #define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
 #define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
 #define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)
 //------------------------------------------------------------------------------
 #if !defined(WEBP_YUV_USE_TABLE)
 // slower on x86 by ~7-8%, but bit-exact with the SSE2 version
 // Converts a YUV_FIX2 fixed-point value to an integer sample. Values with
 // bits set outside YUV_MASK2 saturate: negative ones to 0, the rest to 255.
 static WEBP_INLINE int VP8Clip8(int v) {
  if ((v & ~YUV_MASK2) != 0) {
    return (v < 0) ? 0 : 255;
  }
  return v >> YUV_FIX2;
 }
 // Red channel from luma 'y' and chroma 'v', clipped by VP8Clip8().
 static WEBP_INLINE int VP8YUVToR(int y, int v) {
  const int fixed_red = kYScale * y + kVToR * v + kRCst;
  return VP8Clip8(fixed_red);
 }
 // Green channel from luma 'y' and both chroma samples, clipped by VP8Clip8().
 static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
  const int fixed_green = kYScale * y - kUToG * u - kVToG * v + kGCst;
  return VP8Clip8(fixed_green);
 }
 // Blue channel from luma 'y' and chroma 'u', clipped by VP8Clip8().
 static WEBP_INLINE int VP8YUVToB(int y, int u) {
  const int fixed_blue = kYScale * y + kUToB * u + kBCst;
  return VP8Clip8(fixed_blue);
 }
 // Writes one pixel as three bytes in R, G, B order.
 static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
                                    uint8_t* const rgb) {
  const int red = VP8YUVToR(y, v);
  const int green = VP8YUVToG(y, u, v);
  const int blue = VP8YUVToB(y, u);
  rgb[0] = red;
  rgb[1] = green;
  rgb[2] = blue;
 }
 // Writes one pixel as three bytes in B, G, R order.
 static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
                                    uint8_t* const bgr) {
  const int blue = VP8YUVToB(y, u);
  const int green = VP8YUVToG(y, u, v);
  const int red = VP8YUVToR(y, v);
  bgr[0] = blue;
  bgr[1] = green;
  bgr[2] = red;
 }
 // Packs one pixel into two bytes: 5 bits of red, 6 bits of green, 5 bits of
 // blue. The byte order of the pair is reversed under WEBP_SWAP_16BIT_CSP.
 static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
                                       uint8_t* const rgb) {
  const int red = VP8YUVToR(y, v);
  const int green = VP8YUVToG(y, u, v);
  const int blue = VP8YUVToB(y, u);
  // Top 5 bits of red followed by the top 3 bits of green.
  const int red_green = (red & 0xf8) | (green >> 5);
  // Next 3 bits of green followed by the top 5 bits of blue.
  const int green_blue = ((green << 3) & 0xe0) | (blue >> 3);
 #ifdef WEBP_SWAP_16BIT_CSP
  rgb[0] = green_blue;
  rgb[1] = red_green;
 #else
  rgb[0] = red_green;
  rgb[1] = green_blue;
 #endif
 }
 // Packs one pixel into two bytes with 4 bits per channel; the alpha nibble is
 // always 0xf. The byte order is reversed under WEBP_SWAP_16BIT_CSP.
 static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
                                         uint8_t* const argb) {
  const int red = VP8YUVToR(y, v);
  const int green = VP8YUVToG(y, u, v);
  const int blue = VP8YUVToB(y, u);
  const int red_green = (red & 0xf0) | (green >> 4);
  const int blue_alpha = (blue & 0xf0) | 0x0f;   // low nibble forced to 0xf
 #ifdef WEBP_SWAP_16BIT_CSP
  argb[0] = blue_alpha;
  argb[1] = red_green;
 #else
  argb[0] = red_green;
  argb[1] = blue_alpha;
 #endif
 }
 #else
 // Table-based version, not totally equivalent to the SSE2 version.
 // Rounding diff is only +/-1 though.
 // This branch is only compiled when WEBP_YUV_USE_TABLE is defined. The tables
 // are defined in yuv.cpp and must have been filled by VP8YUVInit() first.
 extern int16_t VP8kVToR[256], VP8kUToB[256];
 extern int32_t VP8kVToG[256], VP8kUToG[256];
 extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
 extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
 // Writes one pixel as R, G, B bytes. 'u' and 'v' index 256-entry tables
 // directly, so they are assumed to be in [0, 255].
 static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
                                    uint8_t* const rgb) {
  const int r_off = VP8kVToR[v];
  // The two green terms are summed at full precision and shifted once.
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int b_off = VP8kUToB[u];
  // Subtracting YUV_RANGE_MIN rebases the (possibly negative) sum to index 0.
  rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];
  rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
 }
 // Same as VP8YuvToRgb() but stores the bytes in B, G, R order.
 static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
                                    uint8_t* const bgr) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int b_off = VP8kUToB[u];
  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
 }
 // Packs one pixel into two bytes (5 bits red, 6 bits green, 5 bits blue);
 // the byte order is reversed under WEBP_SWAP_16BIT_CSP.
 static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
                                       uint8_t* const rgb) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int b_off = VP8kUToB[u];
  const int rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
                  (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
  const int gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
                   (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
 #ifdef WEBP_SWAP_16BIT_CSP
  rgb[0] = gb;
  rgb[1] = rg;
 #else
  rgb[0] = rg;
  rgb[1] = gb;
 #endif
 }
 // Packs one pixel into two bytes with 4 bits per channel, using the 4-bit
 // clipping table; the alpha nibble is always 0xf.
 static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
                                         uint8_t* const argb) {
  const int r_off = VP8kVToR[v];
  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
  const int b_off = VP8kUToB[u];
  const int rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
                   VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
  const int ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
 #ifdef WEBP_SWAP_16BIT_CSP
  argb[0] = ba;
  argb[1] = rg;
 #else
  argb[0] = rg;
  argb[1] = ba;
 #endif
 }
 #endif  // WEBP_YUV_USE_TABLE
 //-----------------------------------------------------------------------------
 // Alpha handling variants
 // Writes one opaque pixel as four bytes: 0xff alpha first, then R, G, B.
 static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const argb) {
  uint8_t* const color = argb + 1;
  argb[0] = 0xff;
  VP8YuvToRgb(y, u, v, color);
 }
 // Writes one opaque pixel as four bytes: B, G, R followed by 0xff alpha.
 static WEBP_INLINE void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const bgra) {
  uint8_t* const alpha = bgra + 3;
  VP8YuvToBgr(y, u, v, bgra);
  *alpha = 0xff;
 }
 // Writes one opaque pixel as four bytes: R, G, B followed by 0xff alpha.
 static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const rgba) {
  uint8_t* const alpha = rgba + 3;
  VP8YuvToRgb(y, u, v, rgba);
  *alpha = 0xff;
 }
 // Must be called before everything, to initialize the tables.
 void VP8YUVInit(void);
 //-----------------------------------------------------------------------------
 // SSE2 extra functions (mostly for upsampling_sse2.c)
 // Declarations below are only visible in builds that define WEBP_USE_SSE2.
 #if defined(WEBP_USE_SSE2)
 // When the following is defined, tables are initialized statically, adding ~12k
 // to the binary size. Otherwise, they are initialized at run-time (small cost).
 #define WEBP_YUV_USE_SSE2_TABLES
 // The 32-pixel converters are only declared when FANCY_UPSAMPLING is defined.
 #if defined(FANCY_UPSAMPLING)
 // Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst);
 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst);
 void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                    uint8_t* dst);
 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* dst);
 #endif  // FANCY_UPSAMPLING
 // Must be called to initialize tables before using the functions.
 void VP8YUVInitSSE2(void);
 #endif    // WEBP_USE_SSE2
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion
 // Stub functions that can be called with various rounding values:
 // Adds 'rounding' and the 128 chroma bias to 'uv', shifts the sum down by
 // (YUV_FIX + 2) bits and saturates the result to the [0, 255] interval.
 static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
  const int shifted = (uv + rounding + (128 << (YUV_FIX + 2))) >> (YUV_FIX + 2);
  if ((shifted & ~0xff) != 0) {
    return (shifted < 0) ? 0 : 255;
  }
  return shifted;
 }
 #ifndef USE_YUVj
 // Luma from r/g/b using YUV_FIX fixed-point weights; the +16 offset is added
 // before the final shift.
 static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
  const int weighted = 16839 * r + 33059 * g + 6420 * b;
  const int biased = weighted + rounding + (16 << YUV_FIX);
  return biased >> YUV_FIX;  // no need to clip
 }
 // U chroma from r/g/b; biasing, scaling and clipping are done by VP8ClipUV().
 static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
  const int weighted = -9719 * r - 19081 * g + 28800 * b;
  const int clipped = VP8ClipUV(weighted, rounding);
  return clipped;
 }
 // V chroma from r/g/b; biasing, scaling and clipping are done by VP8ClipUV().
 static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
  const int weighted = +28800 * r - 24116 * g - 4684 * b;
  const int clipped = VP8ClipUV(weighted, rounding);
  return clipped;
 }
 #else
 // This JPEG-YUV colorspace, only for comparison!
 // These are also 16bit precision coefficients from Rec.601, but with full
 // [0..255] output range.
 // This branch is only compiled when USE_YUVj is defined.
 // Luma: unlike the default variant, no +16 offset is added before the shift.
 static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
  const int luma = 19595 * r + 38470 * g + 7471 * b;
  return (luma + rounding) >> YUV_FIX;  // no need to clip
 }
 // U chroma; biasing, scaling and clipping are done by VP8ClipUV().
 static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
  const int u = -11058 * r - 21710 * g + 32768 * b;
  return VP8ClipUV(u, rounding);
 }
 // V chroma; biasing, scaling and clipping are done by VP8ClipUV().
 static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
  const int v = 32768 * r - 27439 * g - 5329 * b;
  return VP8ClipUV(v, rounding);
 }
 #endif    // USE_YUVj
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_DSP_YUV_H_ */
--- a/src/loaders/webp/meson.build
+++ b/src/loaders/webp/meson.build
@ -0,0 +1,17 @@
 # Dependency list for the bundled WebP decoder. It starts empty here and is
 # handed to declare_dependency() below; presumably the sub-directories append
 # their own entries to it (their build files are not shown here).
 webp_deb = []
 subdir('dec')
 subdir('dsp')
 subdir('utils')
 subdir('webp')
 # ThorVG-side loader sources that sit on top of the decoder.
 source_file = [
   'tvgWebpLoader.h',
   'tvgWebpLoader.cpp',
 ]
 # Register the loader (sources, include path and decoder dependencies) with
 # the list of sub-loaders.
 subloader_dep += [declare_dependency(
    include_directories : include_directories('.'),
    dependencies : webp_deb,
    sources : source_file
    )]
--- a/src/loaders/webp/tvgWebpLoader.cpp
+++ b/src/loaders/webp/tvgWebpLoader.cpp
@ -0,0 +1,150 @@
 /*
 * Copyright (c) 2024 the ThorVG project. All rights reserved.
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
 #include "webp/decode.h"
 #include "tvgWebpLoader.h"
 /************************************************************************/
 /* Internal Class Implementation                                        */
 /************************************************************************/
 //Drops the encoded input buffer, freeing it only when this loader owns it.
 void WebpLoader::clear()
 {
    if (freeData) {
        free(data);
        freeData = false;
    }
    data = nullptr;
 }
 //Task body: decodes the encoded buffer into surface.buf8, fills in the
 //surface description and then releases the encoded input.
 void WebpLoader::run(unsigned tid)
 {
    const auto width = static_cast<uint32_t>(w);
    const auto height = static_cast<uint32_t>(h);

    //TODO: we can figure out the requested image format in advance.
    surface.buf8 = WebPDecodeBGRA(data, size, nullptr, nullptr);

    surface.w = width;
    surface.h = height;
    surface.stride = width;
    surface.channelSize = sizeof(uint32_t);
    surface.cs = ColorSpace::ARGB8888;
    surface.premultiplied = false;

    //The encoded bytes are not needed once decoding has been attempted.
    clear();
 }
 /************************************************************************/
 /* External Class Implementation                                        */
 /************************************************************************/
 //Tags the base ImageLoader with the WebP file type; members keep their
 //in-class defaults.
 WebpLoader::WebpLoader()
    : ImageLoader(FileType::Webp)
 {}
 //Releases the decoded pixel buffer and any encoded input still held.
 WebpLoader::~WebpLoader()
 {
    free(surface.buf8);
    clear();
 }
 bool WebpLoader::open(const string& path)
 {
    auto f = fopen(path.c_str(), "rb");
    if (!f) return false;
    fseek(f, 0, SEEK_END);
    size = ftell(f);
    if (size == 0) {
        fclose(f);
        return false;
    }
    data = (uint8_t*)malloc(size);
    fseek(f, 0, SEEK_SET);
    auto ret = fread(data, sizeof(char), size, f);
    if (ret < size) {
        fclose(f);
        return false;
    }
    fclose(f);
    int width, height;
    if (!WebPGetInfo(data, size, &width, &height)) return false;
    w = static_cast<float>(width);
    h = static_cast<float>(height);
    freeData = true;
    return true;
 }
 //Opens an in-memory WebP image. With 'copy' set the bytes are duplicated
 //into a buffer owned by the loader; otherwise the caller's buffer is
 //referenced directly. Returns false when the copy cannot be allocated or
 //when WebPGetInfo() rejects the data.
 bool WebpLoader::open(const char* data, uint32_t size, TVG_UNUSED const string& rpath, bool copy)
 {
    if (!copy) {
        //Borrow the caller's memory; it is never freed by this loader.
        this->data = (uint8_t*) data;
        freeData = false;
    } else {
        this->data = (uint8_t*) malloc(size);
        if (!this->data) return false;
        memcpy((uint8_t*)this->data, data, size);
        freeData = true;
    }

    int width, height;
    if (!WebPGetInfo(this->data, size, &width, &height)) return false;

    this->size = size;
    w = static_cast<float>(width);
    h = static_cast<float>(height);

    return true;
 }
 //Schedules the decoding task. Returns true without scheduling when
 //LoadModule::read() returns false, and false when there is no input data
 //or the image has a zero dimension.
 bool WebpLoader::read()
 {
    if (LoadModule::read()) {
        const bool decodable = data && w != 0 && h != 0;
        if (!decodable) return false;
        TaskScheduler::request(this);
    }
    return true;
 }
 //Closes the loader. done() is only called, and true only returned, when
 //LoadModule::close() succeeds.
 bool WebpLoader::close()
 {
    if (LoadModule::close()) {
        this->done();
        return true;
    }
    return false;
 }
 //Calls done() on the task before handing out whatever ImageLoader::bitmap()
 //returns.
 Surface* WebpLoader::bitmap()
 {
    this->done();
    auto decoded = ImageLoader::bitmap();
    return decoded;
 }
--- a/src/loaders/webp/tvgWebpLoader.h
+++ b/src/loaders/webp/tvgWebpLoader.h
@ -0,0 +1,48 @@
 /*
 * Copyright (c) 2024 the ThorVG project. All rights reserved.
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
 #ifndef _TVG_WEBP_LOADER_H_
 #define _TVG_WEBP_LOADER_H_
 #include "tvgLoader.h"
 #include "tvgTaskScheduler.h"
 //Image loader for WebP content. It is both an ImageLoader and a Task: the
 //decoding itself happens in run(), which read() hands to the task scheduler.
 class WebpLoader : public ImageLoader, public Task
 {
 private:
    uint8_t* data = nullptr;   //encoded WebP bytes; released by clear()
    uint32_t size = 0;         //number of bytes in 'data'
    bool freeData = false;     //true when 'data' is owned and must be freed
    //Releases 'data' (if owned) and resets the input-buffer state.
    void clear();
    //Task entry point: decodes 'data' into the surface.
    void run(unsigned tid) override;
 public:
    WebpLoader();
    ~WebpLoader();
    //Loads a WebP file from disk and reads its dimensions.
    bool open(const string& path) override;
    //Opens an in-memory WebP image; 'copy' selects whether the bytes are duplicated.
    bool open(const char* data, uint32_t size, const string& rpath, bool copy) override;
    //Requests the decoding task.
    bool read() override;
    //Closes the loader, calling done() on the task when the close succeeds.
    bool close() override;
    //Calls done() and returns the result of ImageLoader::bitmap().
    Surface* bitmap() override;
 };
 #endif //_TVG_WEBP_LOADER_H_
--- a/src/loaders/webp/utils/bit_reader.cpp
+++ b/src/loaders/webp/utils/bit_reader.cpp
@ -0,0 +1,216 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Boolean decoder non-inlined methods
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifdef HAVE_CONFIG_H
 #include "../webp/config.h"
 #endif
 #include "./bit_reader_inl.h"
 //------------------------------------------------------------------------------
 // VP8BitReader
 // Prepares 'br' to decode the bytes in [start, end) and pre-loads the first
 // bits through VP8LoadNewBytes().
 void VP8InitBitReader(VP8BitReader* const br,
                      const uint8_t* const start, const uint8_t* const end) {
  assert(br != NULL);
  assert(start != NULL);
  assert(start <= end);
  // Input window.
  br->buf_     = start;
  br->buf_end_ = end;
  br->eof_     = 0;
  // Decoder state.
  br->value_   = 0;
  br->range_   = 255 - 1;
  br->bits_    = -8;   // to load the very first 8bits
  VP8LoadNewBytes(br);
 }
 // Shifts both buffer pointers of 'br' by 'offset' bytes. A reader whose
 // buffer pointer is NULL is left untouched.
 void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset) {
  if (br->buf_ == NULL) return;
  br->buf_end_ += offset;
  br->buf_ += offset;
 }
 // Shift amounts, one per index in [0, 127]. Together with kVP8NewRange the
 // entries satisfy kVP8NewRange[i] == ((i + 1) << kVP8Log2Range[i]) - 1.
 // The first row is one entry shorter so that each following row starts at an
 // index of the form 2^k - 1.
 const uint8_t kVP8Log2Range[128] = {
     7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  0
 };
 // range = ((range - 1) << kVP8Log2Range[range]) + 1
 // Precomputed result of the renormalization above for every index in
 // [0, 127]; all entries lie in [127, 253].
 const uint8_t kVP8NewRange[128] = {
  127, 127, 191, 127, 159, 191, 223, 127,
  143, 159, 175, 191, 207, 223, 239, 127,
  135, 143, 151, 159, 167, 175, 183, 191,
  199, 207, 215, 223, 231, 239, 247, 127,
  131, 135, 139, 143, 147, 151, 155, 159,
  163, 167, 171, 175, 179, 183, 187, 191,
  195, 199, 203, 207, 211, 215, 219, 223,
  227, 231, 235, 239, 243, 247, 251, 127,
  129, 131, 133, 135, 137, 139, 141, 143,
  145, 147, 149, 151, 153, 155, 157, 159,
  161, 163, 165, 167, 169, 171, 173, 175,
  177, 179, 181, 183, 185, 187, 189, 191,
  193, 195, 197, 199, 201, 203, 205, 207,
  209, 211, 213, 215, 217, 219, 221, 223,
  225, 227, 229, 231, 233, 235, 237, 239,
  241, 243, 245, 247, 249, 251, 253, 127
 };
 // Byte-at-a-time refill. While input remains, one byte is appended to
 // value_. The first call after the input is exhausted appends eight zero
 // bits and raises eof_; every later call only resets bits_ to 0.
 void VP8LoadFinalBytes(VP8BitReader* const br) {
  assert(br != NULL && br->buf_ != NULL);
  // Only read 8bits at a time
  if (br->buf_ < br->buf_end_) {
    const bit_t incoming = (bit_t)(*br->buf_++);
    br->value_ = incoming | (br->value_ << 8);
    br->bits_ += 8;
    return;
  }
  if (br->eof_) {
    br->bits_ = 0;  // This is to avoid undefined behaviour with shifts.
    return;
  }
  br->value_ <<= 8;
  br->bits_ += 8;
  br->eof_ = 1;
 }
 //------------------------------------------------------------------------------
 // Higher-level calls
 // Reads 'bits' bits, most significant first, each through
 // VP8GetBit(br, 0x80), and returns them as an unsigned value. Returns 0 when
 // 'bits' is not positive.
 uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
  uint32_t result = 0;
  int shift;
  for (shift = bits - 1; shift >= 0; --shift) {
    result |= VP8GetBit(br, 0x80) << shift;
  }
  return result;
 }
 // Reads a 'bits'-wide magnitude followed by one more bit; the magnitude is
 // negated when that extra bit is set.
 int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
  const int magnitude = VP8GetValue(br, bits);
  if (VP8Get(br)) {
    return -magnitude;
  }
  return magnitude;
 }
 //------------------------------------------------------------------------------
 // VP8LBitReader
 #define VP8L_LOG8_WBITS 4  // Number of bytes needed to store VP8L_WBITS bits.
 // Enables the fast refill path of VP8LDoFillBitWindow() on ARM and x86
 // targets, unless WEBP_FORCE_ALIGNED is defined.
 #if !defined(WEBP_FORCE_ALIGNED) && \
    (defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
     defined(__i386__) || defined(_M_IX86) || \
     defined(__x86_64__) || defined(_M_X64))
 #define VP8L_USE_UNALIGNED_LOAD
 #endif
 // kBitMask[n] has the n lowest bits set; it is used by VP8LReadBits() to
 // keep only the requested number of bits.
 static const uint32_t kBitMask[VP8L_MAX_NUM_BIT_READ + 1] = {
  0,
  0x000001, 0x000003, 0x000007, 0x00000f,
  0x00001f, 0x00003f, 0x00007f, 0x0000ff,
  0x0001ff, 0x0003ff, 0x0007ff, 0x000fff,
  0x001fff, 0x003fff, 0x007fff, 0x00ffff,
  0x01ffff, 0x03ffff, 0x07ffff, 0x0fffff,
  0x1fffff, 0x3fffff, 0x7fffff, 0xffffff
 };
 // Attaches 'br' to the 'length' bytes at 'start' and pre-loads up to
 // sizeof(br->val_) of them into the bit window, first byte in the lowest bits.
 void VP8LInitBitReader(VP8LBitReader* const br, const uint8_t* const start,
                       size_t length) {
  assert(br != NULL);
  assert(start != NULL);
  assert(length < 0xfffffff8u);   // can't happen with a RIFF chunk.

  // Never pre-load more bytes than the window can hold.
  const size_t preload =
      (length > sizeof(br->val_)) ? sizeof(br->val_) : length;
  vp8l_val_t window = 0;
  for (size_t n = 0; n < preload; ++n) {
    window |= (vp8l_val_t)start[n] << (8 * n);
  }

  br->buf_ = start;
  br->len_ = length;
  br->pos_ = preload;
  br->val_ = window;
  br->bit_pos_ = 0;
  br->eos_ = 0;
 }
 // Points 'br' at a new buffer of 'len' bytes while keeping the current read
 // position, then re-evaluates the end-of-stream flag.
 void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
                            const uint8_t* const buf, size_t len) {
  assert(br != NULL);
  assert(buf != NULL);
  assert(len < 0xfffffff8u);   // can't happen with a RIFF chunk.
  br->len_ = len;
  br->buf_ = buf;
  // pos_ > len_ should be considered a param error.
  if (br->pos_ > br->len_) {
    br->eos_ = 1;
  } else {
    br->eos_ = VP8LIsEndOfStream(br) ? 1 : 0;
  }
 }
 // Flags 'br' as finished and rewinds the bit position to 0.
 static void VP8LSetEndOfStream(VP8LBitReader* const br) {
  br->bit_pos_ = 0;  // To avoid undefined behaviour with shifts.
  br->eos_ = 1;
 }
 // Slow refill: for every whole byte already consumed from the window, drops
 // it and pulls the next input byte into the top of the window, for as long
 // as input bytes remain. Marks end-of-stream afterwards if it was reached.
 static void ShiftBytes(VP8LBitReader* const br) {
  for (; br->bit_pos_ >= 8 && br->pos_ < br->len_; br->bit_pos_ -= 8) {
    const vp8l_val_t incoming = (vp8l_val_t)br->buf_[br->pos_];
    br->val_ = (br->val_ >> 8) | (incoming << (VP8L_LBITS - 8));
    ++br->pos_;
  }
  if (VP8LIsEndOfStream(br)) {
    VP8LSetEndOfStream(br);
  }
 }
 // Refills the bit window once at least VP8L_WBITS bits have been consumed.
 // On targets that define VP8L_USE_UNALIGNED_LOAD, and while more than
 // sizeof(br->val_) bytes remain, VP8L_LOG8_WBITS bytes are pulled in at once;
 // otherwise the byte-by-byte ShiftBytes() path is taken.
 void VP8LDoFillBitWindow(VP8LBitReader* const br) {
  assert(br->bit_pos_ >= VP8L_WBITS);
 #if defined(VP8L_USE_UNALIGNED_LOAD)
  if (br->pos_ + sizeof(br->val_) < br->len_) {
    // Assemble the next 32 bits byte by byte in little-endian order. Unlike
    // dereferencing a casted uint32_t pointer, this is well defined for any
    // alignment of the input and does not depend on the host byte order;
    // compilers typically reduce it to a single load where that is legal.
    const uint8_t* const src = br->buf_ + br->pos_;
    const uint32_t fresh = (uint32_t)src[0] |
                           ((uint32_t)src[1] << 8) |
                           ((uint32_t)src[2] << 16) |
                           ((uint32_t)src[3] << 24);
    br->val_ >>= VP8L_WBITS;
    br->bit_pos_ -= VP8L_WBITS;
    br->val_ |= (vp8l_val_t)fresh << (VP8L_LBITS - VP8L_WBITS);
    br->pos_ += VP8L_LOG8_WBITS;
    return;
  }
 #endif
  ShiftBytes(br);       // Slow path.
 }
 // Returns the next 'n_bits' bits of the stream and advances the reader.
 // When the reader is already at end-of-stream, or 'n_bits' exceeds
 // VP8L_MAX_NUM_BIT_READ, the reader is flagged as finished and 0 is returned.
 uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
  assert(n_bits >= 0);
  // Flag an error if end_of_stream or n_bits is more than allowed limit.
  if (br->eos_ || n_bits > VP8L_MAX_NUM_BIT_READ) {
    VP8LSetEndOfStream(br);
    return 0;
  }
  const uint32_t bits = VP8LPrefetchBits(br) & kBitMask[n_bits];
  br->bit_pos_ += n_bits;
  ShiftBytes(br);
  return bits;
 }
 //------------------------------------------------------------------------------
--- a/src/loaders/webp/utils/bit_reader.h
+++ b/src/loaders/webp/utils/bit_reader.h
@ -0,0 +1,168 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Boolean decoder
 //
 // Author: Skal (pascal.massimino@gmail.com)
 //         Vikas Arora (vikaas.arora@gmail.com)
 #ifndef WEBP_UTILS_BIT_READER_H_
 #define WEBP_UTILS_BIT_READER_H_
 #include <assert.h>
 #ifdef _MSC_VER
 #include <stdlib.h>  // _byteswap_ulong
 #endif
 #include "../webp/types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // The Boolean decoder needs to maintain infinite precision on the value_ field.
 // However, since range_ is only 8bit, we only need an active window of 8 bits
 // for value_. Left bits (MSB) get zeroed and shifted away when value_ falls
 // below 128, range_ is updated, and fresh bits read from the bitstream are
 // brought in as LSB. To avoid reading the fresh bits one by one (slow), we
 // cache BITS of them ahead. The total of (BITS + 8) bits must fit into a
 // natural register (with type bit_t). To fetch BITS bits from bitstream we
 // use a type lbit_t.
 //
 // BITS can be any multiple of 8 from 8 to 56 (inclusive).
 // Pick values that fit natural register size.
 // Selects BITS, the number of bits cached ahead by the boolean decoder, per
 // target architecture. Only the x86-64 branch picks 56 (which makes bit_t a
 // 64-bit type below); every other branch, including the fallback, picks 24.
 #if defined(__i386__) || defined(_M_IX86)      // x86 32bit
 #define BITS 24
 #elif defined(__x86_64__) || defined(_M_X64)   // x86 64bit
 #define BITS 56
 #elif defined(__arm__) || defined(_M_ARM)      // ARM
 #define BITS 24
 #elif defined(__mips__)                        // MIPS
 #define BITS 24
 #else                                          // reasonable default
 #define BITS 24  // TODO(skal): test aarch64 and find the proper BITS value.
 #endif
 //------------------------------------------------------------------------------
 // Derived types and constants:
 //   bit_t = natural register type for storing 'value_' (which is BITS+8 bits)
 //   range_t = register for 'range_' (which is 8bits only)
 // bit_t must hold BITS + 8 bits: 32 bits suffice up to BITS == 24, anything
 // larger needs a 64-bit register.
 #if (BITS > 24)
 typedef uint64_t bit_t;
 #else
 typedef uint32_t bit_t;
 #endif
 // range_ only needs 8 bits of payload but is stored in a 32-bit type.
 typedef uint32_t range_t;
 //------------------------------------------------------------------------------
 // Bitreader
 // State of the boolean (arithmetic) decoder together with the byte buffer it
 // reads from. Set up with VP8InitBitReader().
 typedef struct VP8BitReader VP8BitReader;
 struct VP8BitReader {
  // boolean decoder  (keep the field ordering as is!)
  bit_t value_;               // current value
  range_t range_;             // current range minus 1. In [127, 254] interval.
  int bits_;                  // number of valid bits left; the inlined readers
                              // reload bytes once this drops below zero
  // read buffer
  const uint8_t* buf_;        // next byte to be read
  const uint8_t* buf_end_;    // end of read buffer
  int eof_;                   // true if input is exhausted
 };
 // Initialize the bit reader and the boolean decoder.
 void VP8InitBitReader(VP8BitReader* const br,
                      const uint8_t* const start, const uint8_t* const end);
 // Update internal pointers to displace the byte buffer by the
 // relative offset 'offset'.
 void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset);
 // return the next value made of 'num_bits' bits
 uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
 // Convenience wrapper: reads exactly one bit through VP8GetValue().
 static WEBP_INLINE uint32_t VP8Get(VP8BitReader* const br) {
  const int single_bit = 1;
  return VP8GetValue(br, single_bit);
 }
 // return the next value with sign-extension.
 int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
 // bit_reader_inl.h will implement the following methods:
 //   static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob)
 //   static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v)
 // and should be included by the .c files that actually need them.
 // This is to avoid recompiling the whole library whenever this file is touched,
 // and also allowing platform-specific ad-hoc hacks.
 // -----------------------------------------------------------------------------
 // Bitreader for lossless format
 // maximum number of bits (inclusive) the bit-reader can handle:
 #define VP8L_MAX_NUM_BIT_READ 24
 #define VP8L_LBITS 64  // Number of bits prefetched (= bit-size of vp8l_val_t).
 #define VP8L_WBITS 32  // Refill threshold, in bits: VP8LFillBitWindow refills
                        // once bit_pos_ reaches this value.
 typedef uint64_t vp8l_val_t;  // right now, this bit-reader can only use 64bit.
 // State of the lossless (VP8L) bit reader: a 64-bit window of prefetched
 // bits plus the byte buffer the window is refilled from.
 typedef struct {
  vp8l_val_t     val_;        // pre-fetched bits
  const uint8_t* buf_;        // input byte buffer
  size_t         len_;        // buffer length
  size_t         pos_;        // byte position in buf_
  int            bit_pos_;    // current bit-reading position in val_
  int            eos_;        // true if a bit was read past the end of buffer
 } VP8LBitReader;
 void VP8LInitBitReader(VP8LBitReader* const br,
                       const uint8_t* const start,
                       size_t length);
 //  Sets a new data buffer.
 void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
                            const uint8_t* const buffer, size_t length);
 // Reads the specified number of bits from read buffer.
 // Flags an error in case end_of_stream or n_bits is more than the allowed limit
 // of VP8L_MAX_NUM_BIT_READ (inclusive).
 // Flags eos_ if this read attempt is going to cross the read buffer.
 uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits);
 // Peeks at the next bits of the window without consuming them, so they can
 // be used for table look-ups. The shift amount is masked to stay below
 // VP8L_LBITS.
 static WEBP_INLINE uint32_t VP8LPrefetchBits(VP8LBitReader* const br) {
  const int shift = br->bit_pos_ & (VP8L_LBITS - 1);
  return (uint32_t)(br->val_ >> shift);
 }
 // Reports whether a read past the end of the buffer has been attempted:
 // either eos_ is already set, or all bytes are consumed and bit_pos_ has moved
 // beyond the window. Does not modify br->eos_.
 static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) {
  assert(br->pos_ <= br->len_);
  if (br->eos_) return 1;
  return (br->pos_ == br->len_) && (br->bit_pos_ > VP8L_LBITS);
 }
 // Moves the bit cursor to 'val' (used together with VP8LPrefetchBits and
 // VP8LFillBitWindow to skip bits), then refreshes the end-of-stream flag.
 static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
  br->bit_pos_ = val;
  // The cursor must be updated before this check, which reads bit_pos_.
  br->eos_ = VP8LIsEndOfStream(br) ? 1 : 0;
 }
 // Makes room for the next 32 bits by advancing the read buffer by 4 bytes.
 // The cheap threshold test is inlined; the refill itself is speed critical
 // but infrequent, so it lives out of line in VP8LDoFillBitWindow().
 extern void VP8LDoFillBitWindow(VP8LBitReader* const br);
 static WEBP_INLINE void VP8LFillBitWindow(VP8LBitReader* const br) {
  if (br->bit_pos_ < VP8L_WBITS) return;  // enough bits still available
  VP8LDoFillBitWindow(br);
 }
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_UTILS_BIT_READER_H_ */
--- a/src/loaders/webp/utils/bit_reader_inl.h
+++ b/src/loaders/webp/utils/bit_reader_inl.h
@ -0,0 +1,172 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Specific inlined methods for boolean decoder [VP8GetBit() ...]
 // This file should be included by the .c sources that actually need to call
 // these methods.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_UTILS_BIT_READER_INL_H_
 #define WEBP_UTILS_BIT_READER_INL_H_
 #ifdef HAVE_CONFIG_H
 #include "../webp/config.h"
 #endif
 #ifdef WEBP_FORCE_ALIGNED
 #include <string.h>  // memcpy
 #endif
 #include "../dsp/dsp.h"
 #include "./bit_reader.h"
 #include "./endian_inl.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // Derived type lbit_t = natural type for memory I/O.
 // It is the smallest unsigned type that can hold BITS bits, so it may be
 // wider than BITS (e.g. BITS == 24 is loaded through a 32-bit lbit_t).
 #if   (BITS > 32)
 typedef uint64_t lbit_t;
 #elif (BITS > 16)
 typedef uint32_t lbit_t;
 #elif (BITS >  8)
 typedef uint16_t lbit_t;
 #else
 typedef uint8_t lbit_t;
 #endif
 extern const uint8_t kVP8Log2Range[128];
 extern const uint8_t kVP8NewRange[128];
 // special case for the tail byte-reading
 void VP8LoadFinalBytes(VP8BitReader* const br);
 //------------------------------------------------------------------------------
 // Inlined critical functions
 // Refills the boolean decoder: makes sure br->value_ has at least BITS bits
 // worth of data. When a whole lbit_t can still be read from the buffer, BITS
 // bits are loaded at once; otherwise the tail is handled by
 // VP8LoadFinalBytes().
 static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
  assert(br != NULL && br->buf_ != NULL);
  // Read 'BITS' bits at a time if possible.
  // The bound uses sizeof(lbit_t), not BITS / 8: the load below always touches
  // a full lbit_t even when fewer bytes are consumed.
  if (br->buf_ + sizeof(lbit_t) <= br->buf_end_) {
    // convert memory type to register type (with some zero'ing!)
    bit_t bits;
 #if defined(WEBP_FORCE_ALIGNED)
    // Alignment-safe load through memcpy.
    lbit_t in_bits;
    memcpy(&in_bits, br->buf_, sizeof(in_bits));
 #elif defined(WEBP_USE_MIPS32)
    // This is needed because of un-aligned read.
    lbit_t in_bits;
    lbit_t* p_buf_ = (lbit_t*)br->buf_;
    __asm__ volatile(
      ".set   push                             \n\t"
      ".set   at                               \n\t"
      ".set   macro                            \n\t"
      "ulw    %[in_bits], 0(%[p_buf_])         \n\t"
      ".set   pop                              \n\t"
      : [in_bits]"=r"(in_bits)
      : [p_buf_]"r"(p_buf_)
      : "memory", "at"
    );
 #else
    // Direct load through a casted byte pointer.
    // NOTE(review): buf_ has no alignment guarantee visible here, so this
    // relies on the target tolerating unaligned loads - confirm per platform.
    const lbit_t in_bits = *(const lbit_t*)br->buf_;
 #endif
    // Only BITS / 8 bytes are consumed, whatever the width of lbit_t.
    br->buf_ += BITS >> 3;
 #if !defined(WORDS_BIGENDIAN)
    // Little-endian host: byte-swap so the first byte read ends up most
    // significant, then shift out the bytes that were loaded but not consumed.
 #if (BITS > 32)
    bits = BSwap64(in_bits);
    bits >>= 64 - BITS;
 #elif (BITS >= 24)
    bits = BSwap32(in_bits);
    bits >>= (32 - BITS);
 #elif (BITS == 16)
    bits = BSwap16(in_bits);
 #else   // BITS == 8
    bits = (bit_t)in_bits;
 #endif  // BITS > 32
 #else    // WORDS_BIGENDIAN
    // Big-endian host: no swap, only the shift.
    bits = (bit_t)in_bits;
    if (BITS != 8 * sizeof(bit_t)) bits >>= (8 * sizeof(bit_t) - BITS);
 #endif
    // Append the fresh bits below the bits already held in value_.
    br->value_ = bits | (br->value_ << BITS);
    br->bits_ += BITS;
  } else {
    VP8LoadFinalBytes(br);    // no need to be inlined
  }
 }
 // Reads one bit with probability 'prob'. Speed-critical function!
 // Returns 1 when the current value lies above the split point derived from
 // 'prob' and the range, 0 otherwise; range_, value_ and bits_ are updated.
 static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
  // Don't move this declaration! It makes a big speed difference to store
  // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
  // alter br->range_ value.
  range_t range = br->range_;
  if (br->bits_ < 0) {
    VP8LoadNewBytes(br);
  }
  {
    const int pos = br->bits_;
    const range_t split = (range * prob) >> 8;
    const range_t value = (range_t)(br->value_ >> pos);
 #if defined(__arm__) || defined(_M_ARM)      // ARM-specific
    // Sign bit of (split - value): 1 exactly when value > split.
    const int bit = ((int)(split - value) >> 31) & 1;
    if (value > split) {
      range -= split + 1;
      br->value_ -= (bit_t)(split + 1) << pos;
    } else {
      range = split;
    }
 #else  // faster version on x86
    int bit;  // Don't use 'const int bit = (value > split);', it's slower.
    if (value > split) {
      range -= split + 1;
      br->value_ -= (bit_t)(split + 1) << pos;
      bit = 1;
    } else {
      range = split;
      bit = 0;
    }
 #endif
    // Renormalize through the look-up tables once the range got too small.
    if (range <= (range_t)0x7e) {
      const int shift = kVP8Log2Range[range];
      range = kVP8NewRange[range];
      br->bits_ -= shift;
    }
    br->range_ = range;
    return bit;
  }
 }
 // simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
 // Reads one bit and returns 'v' when it is 0, or '-v' when it is 1.
 static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
  if (br->bits_ < 0) {
    VP8LoadNewBytes(br);
  }
  {
    const int pos = br->bits_;
    const range_t split = br->range_ >> 1;
    const range_t value = (range_t)(br->value_ >> pos);
    // All-ones when value > split (the decoded bit is 1), zero otherwise.
    const int32_t mask = (int32_t)(split - value) >> 31;  // -1 or 0
    br->bits_ -= 1;
    br->range_ += mask;
    br->range_ |= 1;
    br->value_ -= (bit_t)((split + 1) & mask) << pos;
    // Branch-free conditional negation: (v ^ -1) - (-1) == -v, (v ^ 0) - 0 == v.
    return (v ^ mask) - mask;
  }
 }
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif   // WEBP_UTILS_BIT_READER_INL_H_
--- a/src/loaders/webp/utils/color_cache.cpp
+++ b/src/loaders/webp/utils/color_cache.cpp
@ -0,0 +1,48 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Color Cache for WebP Lossless
 //
 // Author: Jyrki Alakuijala (jyrki@google.com)
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 #include "./color_cache.h"
 #include "../utils/utils.h"
 //------------------------------------------------------------------------------
 // VP8LColorCache.
 // Allocates a zero-initialized color cache holding (1 << hash_bits) entries
 // and records the matching hash shift (32 - hash_bits).
 // Returns 1 on success, or 0 when 'cc' is NULL, 'hash_bits' is outside
 // [1, 31] or the allocation fails.
 int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
  assert(cc != NULL);
  assert(hash_bits > 0);
  // The asserts vanish in NDEBUG builds, so enforce the same contract at run
  // time: an out-of-range width would otherwise produce an undefined shift
  // here and a meaningless hash_shift_ for every later look-up.
  if (cc == NULL || hash_bits <= 0 || hash_bits >= 32) return 0;
  // Compute the entry count directly in size_t, the type calloc() expects.
  cc->colors_ = (uint32_t*)calloc((size_t)1 << hash_bits, sizeof(*cc->colors_));
  if (cc->colors_ == NULL) return 0;
  cc->hash_shift_ = 32 - hash_bits;
  cc->hash_bits_ = hash_bits;
  return 1;
 }
 // Frees the entry array of the cache and resets the pointer to NULL.
 // A NULL 'cc' is accepted and ignored.
 void VP8LColorCacheClear(VP8LColorCache* const cc) {
  if (cc == NULL) return;
  free(cc->colors_);
  cc->colors_ = NULL;
 }
 // Copies every entry of 'src' into 'dst'. Both caches must already be
 // initialized with the same number of hash bits.
 void VP8LColorCacheCopy(const VP8LColorCache* const src,
                        VP8LColorCache* const dst) {
  assert(src != NULL);
  assert(dst != NULL);
  assert(src->hash_bits_ == dst->hash_bits_);
  {
    const size_t num_entries = (size_t)1u << dst->hash_bits_;
    memcpy(dst->colors_, src->colors_, num_entries * sizeof(*dst->colors_));
  }
 }
--- a/src/loaders/webp/utils/color_cache.h
+++ b/src/loaders/webp/utils/color_cache.h
@ -0,0 +1,74 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Color Cache for WebP Lossless
 //
 // Authors: Jyrki Alakuijala (jyrki@google.com)
 //          Urvang Joshi (urvang@google.com)
 #ifndef WEBP_UTILS_COLOR_CACHE_H_
 #define WEBP_UTILS_COLOR_CACHE_H_
 #include "../webp/types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Main color cache struct: a table of ARGB values indexed by a multiplicative
 // hash of the color.
 typedef struct {
  uint32_t *colors_;  // color entries
  int hash_shift_;    // Hash shift: 32 - hash_bits_.
  int hash_bits_;     // Number of bits of a key (set by VP8LColorCacheInit).
 } VP8LColorCache;
 // Multiplier of the hash: key = (kHashMul * argb) >> hash_shift_.
 static const uint32_t kHashMul = 0x1e35a7bd;
 // Returns the color stored under 'key', which must be a valid index for the
 // cache's hash width (only checked by an assert).
 static WEBP_INLINE uint32_t VP8LColorCacheLookup(
    const VP8LColorCache* const cc, uint32_t key) {
  const uint32_t* slot;
  assert(key <= (~0U >> cc->hash_shift_));
  slot = cc->colors_ + key;
  return *slot;
 }
 // Stores 'argb' in the slot selected by its hash, overwriting whatever color
 // occupied that slot before.
 static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
                                             uint32_t argb) {
  cc->colors_[(kHashMul * argb) >> cc->hash_shift_] = argb;
 }
 // Returns the slot index (hash key) that 'argb' maps to in this cache.
 static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
                                              uint32_t argb) {
  const uint32_t hashed = kHashMul * argb;
  return hashed >> cc->hash_shift_;
 }
 // Returns non-zero when the slot that 'argb' hashes to currently holds
 // exactly that color.
 static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
                                              uint32_t argb) {
  const uint32_t* const slot =
      &cc->colors_[(kHashMul * argb) >> cc->hash_shift_];
  return (*slot == argb) ? 1 : 0;
 }
 //------------------------------------------------------------------------------
 // Initializes the color cache with 'hash_bits' bits for the keys.
 // Returns false in case of memory error.
 int VP8LColorCacheInit(VP8LColorCache* const color_cache, int hash_bits);
 void VP8LColorCacheCopy(const VP8LColorCache* const src,
                        VP8LColorCache* const dst);
 // Delete the memory associated to color cache.
 void VP8LColorCacheClear(VP8LColorCache* const color_cache);
 //------------------------------------------------------------------------------
 #ifdef __cplusplus
 }
 #endif
 #endif  // WEBP_UTILS_COLOR_CACHE_H_
--- a/src/loaders/webp/utils/endian_inl.h
+++ b/src/loaders/webp/utils/endian_inl.h
@ -0,0 +1,100 @@
 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Endian related functions.
 #ifndef WEBP_UTILS_ENDIAN_INL_H_
 #define WEBP_UTILS_ENDIAN_INL_H_
 #ifdef HAVE_CONFIG_H
 #include "../webp/config.h"
 #endif
 #include "../dsp/dsp.h"
 #include "../webp/types.h"
 // some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
 // Defines WORDS_BIGENDIAN when the compiler advertises a big-endian target
 // and the build system has not defined it already.
 #if !defined(WORDS_BIGENDIAN) && \
    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
 #define WORDS_BIGENDIAN
 #endif
 // Host-to-little-endian helpers: a byte swap on big-endian hosts (using the
 // BSwap functions defined further down), the identity otherwise.
 #if defined(WORDS_BIGENDIAN)
 #define HToLE32 BSwap32
 #define HToLE16 BSwap16
 #else
 #define HToLE32(x) (x)
 #define HToLE16(x) (x)
 #endif
 // Without a generated config.h, detect the byte-swap builtins from the
 // compiler version. LOCAL_GCC_PREREQ / LOCAL_CLANG_PREREQ are not defined in
 // this header (presumably they come from ../dsp/dsp.h - confirm).
 #if !defined(HAVE_CONFIG_H)
 // clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
 #if LOCAL_GCC_PREREQ(4,3) || LOCAL_CLANG_PREREQ(3,3)
 #define HAVE_BUILTIN_BSWAP32
 #define HAVE_BUILTIN_BSWAP64
 #endif
 // clang-3.3 and gcc-4.8 have a builtin function for swap16
 #if LOCAL_GCC_PREREQ(4,8) || LOCAL_CLANG_PREREQ(3,3)
 #define HAVE_BUILTIN_BSWAP16
 #endif
 #endif  // !HAVE_CONFIG_H
 // Returns 'x' with its two bytes exchanged, using the compiler builtin or the
 // MSVC intrinsic when available and a portable shift/mask otherwise.
 static WEBP_INLINE uint16_t BSwap16(uint16_t x) {
 #if defined(HAVE_BUILTIN_BSWAP16)
  return __builtin_bswap16(x);
 #elif defined(_MSC_VER)
  return _byteswap_ushort(x);
 #else
  // gcc will recognize a 'rorw $8, ...' here:
  return (x >> 8) | ((x & 0xff) << 8);
 #endif  // HAVE_BUILTIN_BSWAP16
 }
 // Returns 'x' with its four bytes in reverse order. Implementations are tried
 // in this order: MIPS32r2 asm, compiler builtin, x86 'bswap' asm, MSVC
 // intrinsic, portable shifts and masks.
 static WEBP_INLINE uint32_t BSwap32(uint32_t x) {
 #if defined(WEBP_USE_MIPS32_R2)
  uint32_t ret;
  __asm__ volatile (
    "wsbh   %[ret], %[x]          \n\t"
    "rotr   %[ret], %[ret],  16   \n\t"
    : [ret]"=r"(ret)
    : [x]"r"(x)
  );
  return ret;
 #elif defined(HAVE_BUILTIN_BSWAP32)
  return __builtin_bswap32(x);
 #elif defined(__i386__) || defined(__x86_64__)
  uint32_t swapped_bytes;
  __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
  return swapped_bytes;
 #elif defined(_MSC_VER)
  return (uint32_t)_byteswap_ulong(x);
 #else
  return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
 #endif  // HAVE_BUILTIN_BSWAP32
 }
 // Returns 'x' with its eight bytes in reverse order. Implementations are
 // tried in this order: compiler builtin, x86-64 'bswapq' asm, MSVC intrinsic,
 // portable three-step swap.
 static WEBP_INLINE uint64_t BSwap64(uint64_t x) {
 #if defined(HAVE_BUILTIN_BSWAP64)
  return __builtin_bswap64(x);
 #elif defined(__x86_64__)
  uint64_t swapped_bytes;
  __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
  return swapped_bytes;
 #elif defined(_MSC_VER)
  return (uint64_t)_byteswap_uint64(x);
 #else  // generic code for swapping 64-bit values (suggested by bdb@)
  // Swap 32-bit halves, then 16-bit pairs, then adjacent bytes.
  x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
  x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
  x = ((x & 0xff00ff00ff00ff00ull) >>  8) | ((x & 0x00ff00ff00ff00ffull) <<  8);
  return x;
 #endif  // HAVE_BUILTIN_BSWAP64
 }
 #endif  // WEBP_UTILS_ENDIAN_INL_H_
--- a/src/loaders/webp/utils/huffman.cpp
+++ b/src/loaders/webp/utils/huffman.cpp
@ -0,0 +1,204 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Utilities for building and looking up Huffman trees.
 //
 // Author: Urvang Joshi (urvang@google.com)
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 #include "./huffman.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 // Huffman data read via DecodeImageStream is represented in two (red and green)
 // bytes.
 #define MAX_HTREE_GROUPS    0x10000
 // Allocates an uninitialized array of 'num_htree_groups' HTreeGroup entries.
 // Returns NULL when the count is negative, exceeds MAX_HTREE_GROUPS, or when
 // the allocation fails. Release the result with VP8LHtreeGroupsFree().
 HTreeGroup* VP8LHtreeGroupsNew(int num_htree_groups) {
  assert(num_htree_groups <= MAX_HTREE_GROUPS);
  // Validate before allocating, and at run time rather than only by assert:
  // a negative count would convert to a huge size_t in the multiplication
  // below, and release builds would otherwise enforce no upper bound at all.
  if (num_htree_groups < 0 || num_htree_groups > MAX_HTREE_GROUPS) {
    return NULL;
  }
  return (HTreeGroup*)malloc((size_t)num_htree_groups * sizeof(HTreeGroup));
 }
 // Releases an array obtained from VP8LHtreeGroupsNew(). NULL is accepted.
 void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups) {
  free(htree_groups);  // free(NULL) is a no-op, so no explicit check is needed
 }
 // Advances a bit-reversed counter: returns reverse(reverse(key, len) + 1, len),
 // where reverse(key, len) is the bit-wise reversal of the 'len' least
 // significant bits of 'key'.
 static WEBP_INLINE uint32_t GetNextKey(uint32_t key, int len) {
  uint32_t bit;
  // Walk down from the top bit past every set bit (the carry chain of the
  // reversed increment).
  for (bit = 1 << (len - 1); (key & bit) != 0; bit >>= 1) {
  }
  return (key & (bit - 1)) + bit;
 }
 // Writes 'code' to table[end - step], table[end - 2 * step], ..., table[0].
 // 'end' must be an integer multiple of 'step'.
 static WEBP_INLINE void ReplicateValue(HuffmanCode* table,
                                       int step, int end,
                                       HuffmanCode code) {
  int idx;
  assert(end % step == 0);
  // At least one entry is always written, as in a do/while loop.
  for (idx = end - step; ; idx -= step) {
    table[idx] = code;
    if (idx <= 0) break;
  }
 }
 // Computes the width, in bits, of the next 2nd level table. 'count' is the
 // histogram of bit lengths for the remaining symbols and 'len' the code length
 // of the next symbol to process.
 static WEBP_INLINE int NextTableBitSize(const int* const count,
                                        int len, int root_bits) {
  int remaining = 1 << (len - root_bits);  // free slots at the current length
  int cur_len = len;
  for (; cur_len < MAX_ALLOWED_CODE_LENGTH; ++cur_len, remaining <<= 1) {
    remaining -= count[cur_len];
    if (remaining <= 0) break;  // codes of this length fill the table
  }
  return cur_len - root_bits;
 }
 // Builds a two-level Huffman lookup table from per-symbol code lengths.
 //   root_table        - output; receives the root table of (1 << root_bits)
 //                       entries followed by any 2nd level tables.
 //   root_bits         - index width of the root table (must be > 0).
 //   code_lengths      - code length of each symbol, in symbol order.
 //   code_lengths_size - number of symbols.
 // Returns the total number of entries written, or 0 on error: a length above
 // MAX_ALLOWED_CODE_LENGTH, all lengths zero, too many codes of one length,
 // allocation failure, or an over-subscribed / incomplete tree.
 // NOTE(review): the capacity of root_table is not checked here; the caller
 // is presumably expected to size it for the worst case - confirm.
 int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
                          const int code_lengths[], int code_lengths_size) {
  HuffmanCode* table = root_table;  // next available space in table
  int total_size = 1 << root_bits;  // total size root table + 2nd level table
  int* sorted = NULL;               // symbols sorted by code length
  int len;                          // current code length
  int symbol;                       // symbol index in original or sorted table
  // number of codes of each length:
  int count[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
  // offsets in sorted table for each length:
  int offset[MAX_ALLOWED_CODE_LENGTH + 1];
  assert(code_lengths_size != 0);
  assert(code_lengths != NULL);
  assert(root_table != NULL);
  assert(root_bits > 0);
  // Build histogram of code lengths.
  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
    if (code_lengths[symbol] > MAX_ALLOWED_CODE_LENGTH) {
      return 0;
    }
    ++count[code_lengths[symbol]];
  }
  // Error, all code lengths are zeros.
  if (count[0] == code_lengths_size) {
    return 0;
  }
  // Generate offsets into sorted symbol table by code length.
  offset[1] = 0;
  for (len = 1; len < MAX_ALLOWED_CODE_LENGTH; ++len) {
    if (count[len] > (1 << len)) {
      return 0;
    }
    offset[len + 1] = offset[len] + count[len];
  }
  sorted = (int*)malloc(code_lengths_size * sizeof(*sorted));
  if (sorted == NULL) {
    return 0;
  }
  // Sort symbols by length, by symbol order within each length.
  // Each offset[] entry is advanced as symbols are placed, so afterwards
  // offset[MAX_ALLOWED_CODE_LENGTH] holds the number of used symbols.
  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
    const int symbol_code_length = code_lengths[symbol];
    if (code_lengths[symbol] > 0) {
      sorted[offset[symbol_code_length]++] = symbol;
    }
  }
  // Special case code with only one value.
  // The whole root table is filled with a zero-bit entry for that symbol.
  if (offset[MAX_ALLOWED_CODE_LENGTH] == 1) {
    HuffmanCode code;
    code.bits = 0;
    code.value = (uint16_t)sorted[0];
    ReplicateValue(table, 1, total_size, code);
    free(sorted);
    return total_size;
  }
  {
    int step;              // step size to replicate values in current table
    uint32_t low = -1;     // low bits for current root entry
    uint32_t mask = total_size - 1;    // mask for low bits
    uint32_t key = 0;      // reversed prefix code
    int num_nodes = 1;     // number of Huffman tree nodes
    int num_open = 1;      // number of open branches in current tree level
    int table_bits = root_bits;        // key length of current table
    int table_size = 1 << table_bits;  // size of current table
    symbol = 0;
    // Fill in root table.
    for (len = 1, step = 2; len <= root_bits; ++len, step <<= 1) {
      num_open <<= 1;
      num_nodes += num_open;
      num_open -= count[len];
      // More codes of this length than open branches: over-subscribed tree.
      if (num_open < 0) {
        free(sorted);
        return 0;
      }
      for (; count[len] > 0; --count[len]) {
        HuffmanCode code;
        code.bits = (uint8_t)len;
        code.value = (uint16_t)sorted[symbol++];
        ReplicateValue(&table[key], step, table_size, code);
        key = GetNextKey(key, len);
      }
    }
    // Fill in 2nd level tables and add pointers to root table.
    for (len = root_bits + 1, step = 2; len <= MAX_ALLOWED_CODE_LENGTH;
         ++len, step <<= 1) {
      num_open <<= 1;
      num_nodes += num_open;
      num_open -= count[len];
      if (num_open < 0) {
        free(sorted);
        return 0;
      }
      for (; count[len] > 0; --count[len]) {
        HuffmanCode code;
        // A new root prefix starts a new 2nd level table; the root entry
        // stores its total bit width and its offset relative to that entry.
        if ((key & mask) != low) {
          table += table_size;
          table_bits = NextTableBitSize(count, len, root_bits);
          table_size = 1 << table_bits;
          total_size += table_size;
          low = key & mask;
          root_table[low].bits = (uint8_t)(table_bits + root_bits);
          root_table[low].value = (uint16_t)((table - root_table) - low);
        }
        code.bits = (uint8_t)(len - root_bits);
        code.value = (uint16_t)sorted[symbol++];
        ReplicateValue(&table[key >> root_bits], step, table_size, code);
        key = GetNextKey(key, len);
      }
    }
    // Check if tree is full.
    if (num_nodes != 2 * offset[MAX_ALLOWED_CODE_LENGTH] - 1) {
      free(sorted);
      return 0;
    }
  }
  free(sorted);
  return total_size;
 }
--- a/src/loaders/webp/utils/huffman.h
+++ b/src/loaders/webp/utils/huffman.h
@ -0,0 +1,67 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Utilities for building and looking up Huffman trees.
 //
 // Author: Urvang Joshi (urvang@google.com)
 #ifndef WEBP_UTILS_HUFFMAN_H_
 #define WEBP_UTILS_HUFFMAN_H_
 #include <assert.h>
 #include "../webp/format_constants.h"
 #include "../webp/types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Index widths, and the matching low-bit masks, of two Huffman lookup tables.
 // NOTE(review): the LENGTHS_* pair presumably applies to the table used for
 // decoding code lengths - confirm against its users.
 #define HUFFMAN_TABLE_BITS      8
 #define HUFFMAN_TABLE_MASK      ((1 << HUFFMAN_TABLE_BITS) - 1)
 #define LENGTHS_TABLE_BITS      7
 #define LENGTHS_TABLE_MASK      ((1 << LENGTHS_TABLE_BITS) - 1)
 // Huffman lookup table entry, as filled in by VP8LBuildHuffmanTable().
 typedef struct {
  uint8_t bits;     // number of bits used for this symbol
  uint16_t value;   // symbol value or table offset
 } HuffmanCode;
 // Huffman table group: one lookup table per meta code
 // (HUFFMAN_CODES_PER_META_CODE tables in total).
 typedef struct HTreeGroup HTreeGroup;
 struct HTreeGroup {
  HuffmanCode* htrees[HUFFMAN_CODES_PER_META_CODE];
  int      is_trivial_literal;  // True, if huffman trees for Red, Blue & Alpha
                                // Symbols are trivial (have a single code).
  uint32_t literal_arb;         // If is_trivial_literal is true, this is the
                                // ARGB value of the pixel, with Green channel
                                // being set to zero.
 };
 // Creates the instance of HTreeGroup with specified number of tree-groups.
 HTreeGroup* VP8LHtreeGroupsNew(int num_htree_groups);
 // Releases the memory allocated for HTreeGroup.
 void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups);
 // Builds Huffman lookup table assuming code lengths are in symbol order.
 // The 'code_lengths' is pre-allocated temporary memory buffer used for creating
 // the huffman table.
 // Returns built table size or 0 in case of error (invalid tree or
 // memory error).
 int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
                          const int code_lengths[], int code_lengths_size);
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  // WEBP_UTILS_HUFFMAN_H_
--- a/src/loaders/webp/utils/meson.build
+++ b/src/loaders/webp/utils/meson.build
@ -0,0 +1,22 @@
 # Headers and sources of the bundled WebP 'utils' component.
 source_file = [
   'bit_reader.h',
   'bit_reader_inl.h',
   'color_cache.h',
   'endian_inl.h',
   'huffman.h',
   'quant_levels_dec.h',
   'random.h',
   'rescaler.h',
   'utils.h',
   'bit_reader.cpp',
   'color_cache.cpp',
   'huffman.cpp',
   'quant_levels_dec.cpp',
   'random.cpp',
   'rescaler.cpp'
 ]
 # Expose them, with this directory on the include path, through the
 # dependency list declared outside this file.
 # NOTE(review): 'webp_deb' looks like a typo of 'webp_dep'; it must match the
 # name used by the parent meson.build - confirm before renaming.
 webp_deb += [declare_dependency(
    include_directories : include_directories('.'),
    sources : source_file
 )]
--- a/src/loaders/webp/utils/quant_levels_dec.cpp
+++ b/src/loaders/webp/utils/quant_levels_dec.cpp
@ -0,0 +1,279 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Implement gradient smoothing: we replace a current alpha value by its
 // surrounding average if it's close enough (that is: the change will be less
 // than the minimum distance between two quantized level).
 // We use sliding window for computing the 2d moving average.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include "./quant_levels_dec.h"
 #include "./utils.h"
 #include <stdlib.h>
 #include <string.h>   // for memset
 // #define USE_DITHERING   // uncomment to enable ordered dithering (not vital)
 #define FIX 16     // fix-point precision for averaging
 #define LFIX 2     // extra precision for look-up table
 #define LUT_SIZE ((1 << (8 + LFIX)) - 1)  // look-up table size
 #if defined(USE_DITHERING)
 #define DFIX 4           // extra precision for ordered dithering
 #define DSIZE 4          // dithering size (must be a power of two)
 // cf. http://en.wikipedia.org/wiki/Ordered_dithering
 static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
  {  0,  8,  2, 10 },     // coefficients are in DFIX fixed-point precision
  { 12,  4, 14,  6 },
  {  3, 11,  1,  9 },
  { 15,  7, 13,  5 }
 };
 #else
 #define DFIX 0
 #endif
 // Working state of the smoothing filter. Everything is set up by InitParams()
 // and released by CleanupParams().
 typedef struct {
  int width_, height_;  // dimension
  int row_;             // current input row being processed
                        // (starts at -radius_ to prime the filter)
  uint8_t* src_;        // input pointer
  uint8_t* dst_;        // output pointer
  int radius_;          // filter radius (=delay)
  int scale_;           // normalization factor, in FIX bits precision
  void* mem_;           // all memory
  // various scratch buffers
  uint16_t* start_;     // first of the (2 * radius_ + 1) accumulation rows
  uint16_t* cur_;       // accumulation row written by the next VFilter() call
  uint16_t* end_;       // one past the accumulation rows; this extra row holds
                        // the VFilter() output consumed by HFilter()
  uint16_t* top_;       // accumulation row written by the previous VFilter()
  uint16_t* average_;   // HFilter() output, 'width_' entries
  // input levels distribution
  int num_levels_;       // number of quantized levels
  int min_, max_;        // min and max level values
  int min_level_dist_;   // smallest distance between two consecutive levels
  int16_t* correction_;  // size = 1 + 2*LUT_SIZE  -> ~4k memory
                         // (points at the middle entry; indexed by
                         // [-LUT_SIZE, LUT_SIZE])
 } SmoothParams;
 //------------------------------------------------------------------------------
 // Mask of every bit above the low (8 + DFIX) bits; any of them being set
 // means the value is negative or too large for an 8-bit result.
 #define CLIP_MASK (int)(~0U << (8 + DFIX))
 // Drops the DFIX fractional bits of 'v' and saturates the result to [0, 255].
 static WEBP_INLINE uint8_t clip_8b(int v) {
  if ((v & CLIP_MASK) == 0) {
    return (uint8_t)(v >> DFIX);   // already in range
  }
  return (v < 0) ? 0u : 255u;
 }
 // Vertical pass: consumes one input row, updates the running 2D sums kept in
 // the circular row buffer and writes, into the 'end_' row, the sum over the
 // last (2 * radius + 1) rows for each column prefix.
 static void VFilter(SmoothParams* const p) {
  const int width = p->width_;
  const uint8_t* const in = p->src_;
  const uint16_t* const prev = p->top_;
  uint16_t* const acc = p->cur_;
  uint16_t* const dst = p->end_;
  uint16_t running = 0;           // every sum wraps modulo 2^16 on purpose
  int i;
  for (i = 0; i < width; ++i) {
    uint16_t total;
    running += in[i];
    total = prev[i] + running;
    dst[i] = total - acc[i];      // 'acc' still holds the oldest row here
    acc[i] = total;
  }
  // The row just written becomes 'top_'; step 'cur_' to the next slot of the
  // circular buffer, wrapping back to 'start_' when 'end_' is reached.
  p->top_ = p->cur_;
  p->cur_ += width;
  if (p->cur_ == p->end_) p->cur_ = p->start_;
  // Edges are replicated: the input pointer stays put while the current row
  // is above the first row, or on/below the last one.
  if (p->row_ >= 0 && p->row_ < p->height_ - 1) {
    p->src_ += p->width_;
  }
 }
 // Horizontal pass: converts the prefix sums stored in the 'end_' row into
 // normalized box averages written to 'average_'. Samples that would fall
 // outside the row are obtained by mirroring.
 static void HFilter(SmoothParams* const p) {
  const int radius = p->radius_;
  const int width = p->width_;
  const uint32_t norm = p->scale_;
  const uint16_t* const sums = p->end_;
  uint16_t* const avg = p->average_;
  int i = 0;
  // left border, mirrored
  while (i <= radius) {
    const uint16_t span = sums[i + radius - 1] + sums[radius - i];
    avg[i] = (span * norm) >> FIX;
    ++i;
  }
  // interior: difference of two prefix sums
  while (i < width - radius) {
    const uint16_t span = sums[i + radius] - sums[i - radius - 1];
    avg[i] = (span * norm) >> FIX;
    ++i;
  }
  // right border, mirrored
  while (i < width) {
    const uint16_t span = 2 * sums[width - 1]
                        - sums[2 * width - 2 - radius - i]
                        - sums[i - radius - 1];
    avg[i] = (span * norm) >> FIX;
    ++i;
  }
 }
 // Corrects one output row in place using the averages computed by HFilter(),
 // then moves the output pointer to the next row. Pixels sitting at the
 // minimum or maximum level are left untouched.
 static void ApplyFilter(SmoothParams* const p) {
  const int width = p->width_;
  const uint16_t* const avg = p->average_;
  const int16_t* const lut = p->correction_;
  uint8_t* const row = p->dst_;
 #if defined(USE_DITHERING)
  const uint8_t* const dither = kOrderedDither[p->row_ % DSIZE];
 #endif
  int i;
  for (i = 0; i < width; ++i) {
    const int level = row[i];
    int corrected;
    if (level >= p->max_ || level <= p->min_) continue;
    // the LUT is indexed by (average - level), both in LFIX precision
    corrected = (level << DFIX) + lut[avg[i] - (level << LFIX)];
 #if defined(USE_DITHERING)
    corrected += dither[i % DSIZE];
 #endif
    row[i] = clip_8b(corrected);
  }
  p->dst_ += width;
 }
 //------------------------------------------------------------------------------
 // Fills the correction table. 'lut' is indexed over [-LUT_SIZE, LUT_SIZE], so
 // it must point at the middle entry of the underlying storage.
 // With t1 = min_dist << LFIX and t2 = 3/4 * t1, the curve (before the final
 // '>> LFIX' rescaling) is:
 //   f(x) = x   for x <= t2
 //   f(x) = 0   for x >= t1
 // with a linear ramp for x in [t2, t1], and f(-x) = -f(x).
 static void InitCorrectionLUT(int16_t* const lut, int min_dist) {
  const int t1 = min_dist << LFIX;
  const int t2 = (3 * t1) >> 2;
  const int peak = t2 << DFIX;
  const int ramp = t1 - t2;
  int idx;
  lut[0] = 0;
  for (idx = 1; idx <= LUT_SIZE; ++idx) {
    int corr;
    if (idx <= t2) {
      corr = idx << DFIX;
    } else if (idx < t1) {
      corr = peak * (t1 - idx) / ramp;
    } else {
      corr = 0;
    }
    corr >>= LFIX;
    lut[idx] = (int16_t)corr;
    lut[-idx] = (int16_t)(-corr);
  }
 }
 // Scans the 'size' samples of 'data' and records in 'p' the lowest and
 // highest level, the smallest gap between two consecutive used levels, and
 // adds the number of distinct levels to p->num_levels_.
 static void CountLevels(const uint8_t* const data, int size,
                         SmoothParams* const p) {
  uint8_t seen[256] = { 0 };
  int lo = 255;
  int hi = 0;
  int best_gap;
  int prev = -1;
  int level;
  int n;
  for (n = 0; n < size; ++n) {
    const int v = data[n];
    if (v < lo) lo = v;
    if (v > hi) hi = v;
    seen[v] = 1;
  }
  p->min_ = lo;
  p->max_ = hi;
  // Compute the minimum distance between two consecutive used levels,
  // starting from the full range as an upper bound.
  best_gap = hi - lo;
  for (level = 0; level < 256; ++level) {
    if (!seen[level]) continue;
    ++p->num_levels_;
    if (prev >= 0 && level - prev < best_gap) {
      best_gap = level - prev;
    }
    prev = level;
  }
  p->min_level_dist_ = best_gap;
 }
 // Initialize all params.
 // Allocates one block holding, in order: (R + 1) rows of 'width' uint16_t
 // accumulators, one row of 'width' averages, and the correction table.
 // Returns 0 if the allocation fails, 1 otherwise; on success the block must be
 // released with CleanupParams().
 // Note: p->num_levels_ is only incremented (by CountLevels()), so 'p' is
 // expected to be zeroed by the caller.
 static int InitParams(uint8_t* const data, int width, int height,
                       int radius, SmoothParams* const p) {
  const int R = 2 * radius + 1;  // total size of the kernel
  const size_t size_scratch_m = (R + 1) * width * sizeof(*p->start_);
  const size_t size_m =  width * sizeof(*p->average_);
  const size_t size_lut = (1 + 2 * LUT_SIZE) * sizeof(*p->correction_);
  const size_t total_size = size_scratch_m + size_m + size_lut;
  uint8_t* mem = (uint8_t*)malloc(1U * total_size);
  if (mem == NULL) return 0;
  p->mem_ = (void*)mem;
  // R circular accumulation rows in [start_, end_), followed by one more row
  // at 'end_' (hence the R + 1 above).
  p->start_ = (uint16_t*)mem;
  p->cur_ = p->start_;
  p->end_ = p->start_ + R * width;
  // 'top_' starts on the last accumulation row, which is zeroed so that the
  // running sums start from 0.
  p->top_ = p->end_ - width;
  memset(p->top_, 0, width * sizeof(*p->top_));
  mem += size_scratch_m;
  p->average_ = (uint16_t*)mem;
  mem += size_m;
  p->width_ = width;
  p->height_ = height;
  // filtering is done in place: input and output both start at 'data'
  p->src_ = data;
  p->dst_ = data;
  p->radius_ = radius;
  p->scale_ = (1 << (FIX + LFIX)) / (R * R);  // normalization constant
  p->row_ = -radius;  // negative rows prime the filter before any output
  // analyze the input distribution so we can best-fit the threshold
  CountLevels(data, width * height, p);
  // correction table
  // (the pointer is offset by LUT_SIZE so that negative indices are valid)
  p->correction_ = ((int16_t*)mem) + LUT_SIZE;
  InitCorrectionLUT(p->correction_, p->min_level_dist_);
  return 1;
 }
 // Releases the single memory block that backs every buffer referenced by 'p'.
 static void CleanupParams(SmoothParams* const p) {
  void* const block = p->mem_;
  free(block);
 }
 // Smooths, in place, the 'width' x 'height' plane stored contiguously in
 // 'data'. 'strength' must be in [0..100] and selects the filter radius
 // (4 * strength / 100, further limited by the image dimensions); a radius of
 // 0 leaves the data untouched.
 // Returns 0 on invalid arguments or allocation failure, 1 otherwise.
 int WebPDequantizeLevels(uint8_t* const data, int width, int height,
                          int strength) {
  int radius;
  if (strength < 0 || strength > 100) return 0;
  if (data == NULL || width <= 0 || height <= 0) return 0;  // bad params
  radius = 4 * strength / 100;
  // Limit the kernel size (2 * radius + 1) to the image dimensions: HFilter()
  // indexes up to 'radius' samples on either side of a pixel and would
  // otherwise read and write outside its row buffers on small images.
  if (2 * radius + 1 > width) radius = (width - 1) >> 1;
  if (2 * radius + 1 > height) radius = (height - 1) >> 1;
  if (radius > 0) {
    SmoothParams p;
    memset(&p, 0, sizeof(p));
    if (!InitParams(data, width, height, radius, &p)) return 0;
    // With one or two levels there is nothing to smooth.
    if (p.num_levels_ > 2) {
      for (; p.row_ < p.height_; ++p.row_) {
        VFilter(&p);  // accumulate average of input
        // Need to wait few rows in order to prime the filter,
        // before emitting some output.
        if (p.row_ >= p.radius_) {
          HFilter(&p);
          ApplyFilter(&p);
        }
      }
    }
    CleanupParams(&p);
  }
  return 1;
 }
--- a/src/loaders/webp/utils/quant_levels_dec.h
+++ b/src/loaders/webp/utils/quant_levels_dec.h
@ -0,0 +1,35 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Alpha plane de-quantization utility
 //
 // Author:  Vikas Arora (vikasa@google.com)
 #ifndef WEBP_UTILS_QUANT_LEVELS_DEC_H_
 #define WEBP_UTILS_QUANT_LEVELS_DEC_H_
 #include "../webp/types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Apply post-processing to input 'data' of size 'width'x'height' assuming that
 // the source was quantized to a reduced number of levels.
 // Strength is in [0..100] and controls the amount of dithering applied.
 // Returns false in case of error (data is NULL, invalid parameters,
 // malloc failure, ...).
 int WebPDequantizeLevels(uint8_t* const data, int width, int height,
                         int strength);
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_UTILS_QUANT_LEVELS_DEC_H_ */
--- a/src/loaders/webp/utils/random.cpp
+++ b/src/loaders/webp/utils/random.cpp
@ -0,0 +1,43 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Pseudo-random utilities
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <string.h>
 #include "./random.h"
 //------------------------------------------------------------------------------
 // 31b-range values
 // Initial generator state: VP8InitRandom() copies these
 // VP8_RANDOM_TABLE_SIZE entries into VP8Random::tab_.
 static const uint32_t kRandomTable[VP8_RANDOM_TABLE_SIZE] = {
  0x0de15230, 0x03b31886, 0x775faccb, 0x1c88626a, 0x68385c55, 0x14b3b828,
  0x4a85fef8, 0x49ddb84b, 0x64fcf397, 0x5c550289, 0x4a290000, 0x0d7ec1da,
  0x5940b7ab, 0x5492577d, 0x4e19ca72, 0x38d38c69, 0x0c01ee65, 0x32a1755f,
  0x5437f652, 0x5abb2c32, 0x0faa57b1, 0x73f533e7, 0x685feeda, 0x7563cce2,
  0x6e990e83, 0x4730a7ed, 0x4fc0d9c6, 0x496b153c, 0x4f1403fa, 0x541afb0c,
  0x73990b32, 0x26d7cb1c, 0x6fcc3706, 0x2cbb77d8, 0x75762f2a, 0x6425ccdd,
  0x24b35461, 0x0a7d8715, 0x220414a8, 0x141ebf67, 0x56b41583, 0x73e502e3,
  0x44cab16f, 0x28264d42, 0x73baaefb, 0x0a50ebed, 0x1d6ab6fb, 0x0d3ad40b,
  0x35db3b68, 0x2b081e83, 0x77ce6b95, 0x5181e5f0, 0x78853bbc, 0x009f9494,
  0x27e5ed3c
 };
 // Resets 'rg' to its initial state (table copied from kRandomTable, indices
 // at 0 and 31) and stores the amplitude: 'dithering' is clamped to [0, 1] and
 // converted to VP8_RANDOM_DITHER_FIX fixed-point precision.
 void VP8InitRandom(VP8Random* const rg, float dithering) {
  const int full_amp = 1 << VP8_RANDOM_DITHER_FIX;
  memcpy(rg->tab_, kRandomTable, sizeof(rg->tab_));
  rg->index1_ = 0;
  rg->index2_ = 31;
  if (dithering < 0.0) {
    rg->amp_ = 0;
  } else if (dithering > 1.0) {
    rg->amp_ = full_amp;
  } else {
    rg->amp_ = (uint32_t)(full_amp * dithering);
  }
 }
 //------------------------------------------------------------------------------
--- a/src/loaders/webp/utils/random.h
+++ b/src/loaders/webp/utils/random.h
@ -0,0 +1,63 @@
 // Copyright 2013 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Pseudo-random utilities
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_UTILS_RANDOM_H_
 #define WEBP_UTILS_RANDOM_H_
 #include <assert.h>
 #include "../webp/types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 #define VP8_RANDOM_DITHER_FIX 8   // fixed-point precision for dithering
 #define VP8_RANDOM_TABLE_SIZE 55
 // State of the pseudo-random generator; set up by VP8InitRandom().
 typedef struct {
  int index1_, index2_;                  // positions of the two table entries
                                         // combined by the next draw
  uint32_t tab_[VP8_RANDOM_TABLE_SIZE];  // generator state table
  int amp_;                              // amplitude, in VP8_RANDOM_DITHER_FIX
                                         // fixed-point precision
 } VP8Random;
 // Initializes random generator with an amplitude 'dithering' in range [0..1].
 void VP8InitRandom(VP8Random* const rg, float dithering);
 // Returns a centered pseudo-random number with 'num_bits' amplitude.
 // (uses D.Knuth's Difference-based random generator).
 // 'amp' is in VP8_RANDOM_DITHER_FIX fixed-point precision.
 // Advances the state of 'rg' by one step.
 static WEBP_INLINE int VP8RandomBits2(VP8Random* const rg, int num_bits,
                                       int amp) {
  int diff;
  assert(num_bits + VP8_RANDOM_DITHER_FIX <= 31);
  // new value = difference of two table entries, wrapped back into 31 bits
  diff = rg->tab_[rg->index1_] - rg->tab_[rg->index2_];
  if (diff < 0) diff += (1u << 31);
  rg->tab_[rg->index1_] = diff;
  // both indices walk the table circularly
  if (++rg->index1_ == VP8_RANDOM_TABLE_SIZE) rg->index1_ = 0;
  if (++rg->index2_ == VP8_RANDOM_TABLE_SIZE) rg->index2_ = 0;
  // sign-extend, 0-center
  diff = (int)((uint32_t)diff << 1) >> (32 - num_bits);
  diff = (diff * amp) >> VP8_RANDOM_DITHER_FIX;  // restrict range
  diff += 1 << (num_bits - 1);                   // shift back to 0.5-center
  return diff;
 }
 // Same as VP8RandomBits2(), using the amplitude stored in 'rg' (rg->amp_, as
 // set by VP8InitRandom()).
 static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
  return VP8RandomBits2(rg, num_bits, rg->amp_);
 }
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_UTILS_RANDOM_H_ */
--- a/src/loaders/webp/utils/rescaler.cpp
+++ b/src/loaders/webp/utils/rescaler.cpp
@ -0,0 +1,82 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Rescaling functions
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #include <assert.h>
 #include <stdlib.h>
 #include "../dsp/dsp.h"
 #include "./rescaler.h"
 //------------------------------------------------------------------------------
 // Sets up 'wrk' for rescaling a src_width x src_height source into the
 // dst_width x dst_height destination 'dst'. 'work' is the scratch area: its
 // first num_channels * dst_width entries become 'irow', and 'frow' starts
 // right after them.
 void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
                       uint8_t* const dst, int dst_width, int dst_height,
                       int dst_stride, int num_channels, int x_add, int x_sub,
                       int y_add, int y_sub, int32_t* const work) {
  const int expanding = (src_width < dst_width);
  wrk->x_expand = expanding;
  wrk->src_width = src_width;
  wrk->src_height = src_height;
  wrk->dst_width = dst_width;
  wrk->dst_height = dst_height;
  wrk->dst = dst;
  wrk->dst_stride = dst_stride;
  wrk->num_channels = num_channels;
  // when expanding horizontally, bilinear interpolation is used and the
  // roles of the two increments are swapped
  if (expanding) {
    wrk->x_add = x_sub - 1;
    wrk->x_sub = x_add - 1;
  } else {
    wrk->x_add = x_add - x_sub;
    wrk->x_sub = x_sub;
  }
  wrk->y_accum = y_add;
  wrk->y_add = y_add;
  wrk->y_sub = y_sub;
  wrk->fx_scale = (1 << WEBP_RESCALER_RFIX) / x_sub;
  wrk->fy_scale = (1 << WEBP_RESCALER_RFIX) / y_sub;
  if (expanding) {
    wrk->fxy_scale =
        ((int64_t)dst_height << WEBP_RESCALER_RFIX) / (x_sub * src_height);
  } else {
    wrk->fxy_scale =
        ((int64_t)dst_height << WEBP_RESCALER_RFIX) / (x_add * src_height);
  }
  wrk->irow = work;
  wrk->frow = work + num_channels * dst_width;
  WebPRescalerDspInit();
 }
 //------------------------------------------------------------------------------
 // all-in-one calls
 // Returns how many input lines are needed before the next output line can be
 // produced, capped at 'max_num_lines'.
 int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
  // y_accum / y_sub, using rounding-up division
  int needed = (wrk->y_accum + wrk->y_sub - 1) / wrk->y_sub;
  if (needed > max_num_lines) needed = max_num_lines;
  return needed;
 }
 // Feeds up to 'num_lines' rows from 'src' (rows are 'src_stride' bytes apart)
 // into the rescaler, one channel at a time, and stops early as soon as the
 // vertical accumulator is no longer positive. Returns the number of rows
 // consumed.
 int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
                        const uint8_t* src, int src_stride) {
  int imported;
  for (imported = 0; imported < num_lines && wrk->y_accum > 0; ++imported) {
    int c = 0;
    while (c < wrk->num_channels) {
      WebPRescalerImportRow(wrk, src, c);
      ++c;
    }
    src += src_stride;
    wrk->y_accum -= wrk->y_sub;
  }
  return imported;
 }
 // Exports rows for as long as the rescaler reports pending output.
 // Returns the number of rows exported.
 int WebPRescalerExport(WebPRescaler* const rescaler) {
  int rows = 0;
  for (; WebPRescalerHasPendingOutput(rescaler); ++rows) {
    WebPRescalerExportRow(rescaler, 0);
  }
  return rows;
 }
 //------------------------------------------------------------------------------
--- a/src/loaders/webp/utils/rescaler.h
+++ b/src/loaders/webp/utils/rescaler.h
@ -0,0 +1,78 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Rescaling functions
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_UTILS_RESCALER_H_
 #define WEBP_UTILS_RESCALER_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
 #include "../webp/types.h"
 #define WEBP_RESCALER_RFIX 30   // fixed-point precision for multiplies
 // Structure used for on-the-fly rescaling
 // All fields are filled in by WebPRescalerInit().
 typedef struct WebPRescaler WebPRescaler;
 struct WebPRescaler {
  int x_expand;               // true if we're expanding in the x direction
  int num_channels;           // bytes to jump between pixels
  int fy_scale, fx_scale;     // fixed-point scaling factor
                              // (WEBP_RESCALER_RFIX bits of precision)
  int64_t fxy_scale;          // combined fixed-point scaling factor
  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
  int y_accum;                // vertical accumulator
  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
  int src_width, src_height;  // source dimensions
  int dst_width, dst_height;  // destination dimensions
  uint8_t* dst;               // destination samples
  int dst_stride;             // stride of 'dst'
  int32_t* irow, *frow;       // work buffer: 'frow' starts
                              // num_channels * dst_width entries after 'irow'
 };
 // Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
 void WebPRescalerInit(WebPRescaler* const rescaler,
                      int src_width, int src_height,
                      uint8_t* const dst,
                      int dst_width, int dst_height, int dst_stride,
                      int num_channels,
                      int x_add, int x_sub,
                      int y_add, int y_sub,
                      int32_t* const work);
 // Returns the number of input lines needed next to produce one output line,
 // considering that the maximum available input lines are 'max_num_lines'.
 int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
                           int max_num_lines);
 // Import multiple rows over all channels, until at least one row is ready to
 // be exported. Returns the actual number of lines that were imported.
 int WebPRescalerImport(WebPRescaler* const rescaler, int num_rows,
                       const uint8_t* src, int src_stride);
 // Returns true when at least one output row is ready, i.e. when the vertical
 // accumulator is no longer positive.
 static WEBP_INLINE
 int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
  return !(rescaler->y_accum > 0);
 }
 // Export as many rows as possible. Return the numbers of rows written.
 int WebPRescalerExport(WebPRescaler* const rescaler);
 //------------------------------------------------------------------------------
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_UTILS_RESCALER_H_ */
--- a/src/loaders/webp/utils/utils.h
+++ b/src/loaders/webp/utils/utils.h
@ -0,0 +1,68 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 // Misc. common utility functions
 //
 // Authors: Skal (pascal.massimino@gmail.com)
 //          Urvang (urvang@google.com)
 #ifndef WEBP_UTILS_UTILS_H_
 #define WEBP_UTILS_UTILS_H_
 #include <assert.h>
 #include "../webp/types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 //------------------------------------------------------------------------------
 // Returns (int)floor(log2(n)), i.e. the index of the highest set bit.
 // n must be > 0: the result is not meaningful for n == 0 (the GNU builtin is
 // undefined for 0, and the MSVC intrinsic leaves its output unset).
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
  return 31 ^ __builtin_clz(n);
 }
 #elif defined(_MSC_VER) && _MSC_VER > 1310 && \
      (defined(_M_X64) || defined(_M_IX86))
 #include <intrin.h>
 #pragma intrinsic(_BitScanReverse)
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
  // _BitScanReverse() writes through an 'unsigned long*': use a variable of
  // that exact type rather than casting the address of a uint32_t.
  unsigned long first_set_bit;
  _BitScanReverse(&first_set_bit, n);
  return (int)first_set_bit;
 }
 #else
 // Portable fallback: binary search for the highest set bit, testing shifts
 // of 16, 8, 4, 2 and 1.
 static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
  int log = 0;
  uint32_t value = n;
  int i;
  for (i = 4; i >= 0; --i) {
    const int shift = (1 << i);
    const uint32_t x = value >> shift;
    if (x != 0) {
      value = x;
      log += shift;
    }
  }
  return log;
 }
 #endif
 //------------------------------------------------------------------------------
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_UTILS_UTILS_H_ */
--- a/src/loaders/webp/webp/decode.h
+++ b/src/loaders/webp/webp/decode.h
@ -0,0 +1,493 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Main decoding functions for WebP images.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_WEBP_DECODE_H_
 #define WEBP_WEBP_DECODE_H_
 #include "./types.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 #define WEBP_DECODER_ABI_VERSION 0x0205    // MAJOR(8b) + MINOR(8b)
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
 // typedef enum VP8StatusCode VP8StatusCode;
 // typedef enum WEBP_CSP_MODE WEBP_CSP_MODE;
 typedef struct WebPRGBABuffer WebPRGBABuffer;
 typedef struct WebPYUVABuffer WebPYUVABuffer;
 typedef struct WebPDecBuffer WebPDecBuffer;
 typedef struct WebPIDecoder WebPIDecoder;
 typedef struct WebPBitstreamFeatures WebPBitstreamFeatures;
 typedef struct WebPDecoderOptions WebPDecoderOptions;
 typedef struct WebPDecoderConfig WebPDecoderConfig;
 // Return the decoder's version number, packed in hexadecimal using 8bits for
 // each of major/minor/revision. E.g: v2.5.7 is 0x020507.
 WEBP_EXTERN(int) WebPGetDecoderVersion(void);
 // Retrieve basic header information: width, height.
 // This function will also validate the header and return 0 in
 // case of formatting error.
 // Pointers 'width' and 'height' can be passed NULL if deemed irrelevant.
 WEBP_EXTERN(int) WebPGetInfo(const uint8_t* data, size_t data_size,
                             int* width, int* height);
 // Decodes WebP images pointed to by 'data' and returns RGBA samples, along
 // with the dimensions in *width and *height. The ordering of samples in
 // memory is R, G, B, A, R, G, B, A... in scan order (endian-independent).
 // The returned pointer should be deleted calling free().
 // Returns NULL in case of error.
 WEBP_EXTERN(uint8_t*) WebPDecodeRGBA(const uint8_t* data, size_t data_size,
                                     int* width, int* height);
 // Same as WebPDecodeRGBA, but returning A, R, G, B, A, R, G, B... ordered data.
 WEBP_EXTERN(uint8_t*) WebPDecodeARGB(const uint8_t* data, size_t data_size,
                                     int* width, int* height);
 // Same as WebPDecodeRGBA, but returning B, G, R, A, B, G, R, A... ordered data.
 WEBP_EXTERN(uint8_t*) WebPDecodeBGRA(const uint8_t* data, size_t data_size,
                                     int* width, int* height);
 // Same as WebPDecodeRGBA, but returning R, G, B, R, G, B... ordered data.
 // If the bitstream contains transparency, it is ignored.
 WEBP_EXTERN(uint8_t*) WebPDecodeRGB(const uint8_t* data, size_t data_size,
                                    int* width, int* height);
 // Same as WebPDecodeRGB, but returning B, G, R, B, G, R... ordered data.
 WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, size_t data_size,
                                    int* width, int* height);
 // Decode WebP images pointed to by 'data' to Y'UV format(*). The pointer
 // returned is the Y samples buffer. Upon return, *u and *v will point to
 // the U and V chroma data. These U and V buffers need NOT be free()'d,
 // unlike the returned Y luma one. The dimension of the U and V planes
 // are both (*width + 1) / 2 and (*height + 1)/ 2.
 // Upon return, the Y buffer has a stride returned as '*stride', while U and V
 // have a common stride returned as '*uv_stride'.
 // Return NULL in case of error.
 // (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
 WEBP_EXTERN(uint8_t*) WebPDecodeYUV(const uint8_t* data, size_t data_size,
                                    int* width, int* height,
                                    uint8_t** u, uint8_t** v,
                                    int* stride, int* uv_stride);
 // These five functions are variants of the above ones, that decode the image
 // directly into a pre-allocated buffer 'output_buffer'. The maximum storage
 // available in this buffer is indicated by 'output_buffer_size'. If this
 // storage is not sufficient (or an error occurred), NULL is returned.
 // Otherwise, output_buffer is returned, for convenience.
 // The parameter 'output_stride' specifies the distance (in bytes)
 // between scanlines. Hence, output_buffer_size is expected to be at least
 // output_stride x picture-height.
 WEBP_EXTERN(uint8_t*) WebPDecodeRGBAInto(
    const uint8_t* data, size_t data_size,
    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 WEBP_EXTERN(uint8_t*) WebPDecodeARGBInto(
    const uint8_t* data, size_t data_size,
    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 WEBP_EXTERN(uint8_t*) WebPDecodeBGRAInto(
    const uint8_t* data, size_t data_size,
    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 // RGB and BGR variants. Here too the transparency information, if present,
 // will be dropped and ignored.
 WEBP_EXTERN(uint8_t*) WebPDecodeRGBInto(
    const uint8_t* data, size_t data_size,
    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto(
    const uint8_t* data, size_t data_size,
    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 // WebPDecodeYUVInto() is a variant of WebPDecodeYUV() that operates directly
 // into pre-allocated luma/chroma plane buffers. This function requires the
 // strides to be passed: one for the luma plane and one for each of the
 // chroma ones. The size of each plane buffer is passed as 'luma_size',
 // 'u_size' and 'v_size' respectively.
 // Pointer to the luma plane ('*luma') is returned or NULL if an error occurred
 // during decoding (or because some buffers were found to be too small).
 WEBP_EXTERN(uint8_t*) WebPDecodeYUVInto(
    const uint8_t* data, size_t data_size,
    uint8_t* luma, size_t luma_size, int luma_stride,
    uint8_t* u, size_t u_size, int u_stride,
    uint8_t* v, size_t v_size, int v_stride);
 //------------------------------------------------------------------------------
 // Output colorspaces and buffer
 // Colorspaces
 // Note: the naming describes the byte-ordering of packed samples in memory.
 // For instance, MODE_BGRA relates to samples ordered as B,G,R,A,B,G,R,A,...
 // Non-capital names (e.g.:MODE_Argb) relates to pre-multiplied RGB channels.
 // RGBA-4444 and RGB-565 colorspaces are represented by following byte-order:
 // RGBA-4444: [r3 r2 r1 r0 g3 g2 g1 g0], [b3 b2 b1 b0 a3 a2 a1 a0], ...
 // RGB-565: [r4 r3 r2 r1 r0 g5 g4 g3], [g2 g1 g0 b4 b3 b2 b1 b0], ...
 // In the case WEBP_SWAP_16BITS_CSP is defined, the bytes are swapped for
 // these two modes:
 // RGBA-4444: [b3 b2 b1 b0 a3 a2 a1 a0], [r3 r2 r1 r0 g3 g2 g1 g0], ...
 // RGB-565: [g2 g1 g0 b4 b3 b2 b1 b0], [r4 r3 r2 r1 r0 g5 g4 g3], ...
 // Output colorspaces. The numeric order matters: WebPIsRGBMode() treats every
 // value below MODE_YUV as an RGB mode.
 typedef enum WEBP_CSP_MODE {
  MODE_RGB = 0, MODE_RGBA = 1,
  MODE_BGR = 2, MODE_BGRA = 3,
  MODE_ARGB = 4, MODE_RGBA_4444 = 5,
  MODE_RGB_565 = 6,
  // RGB-premultiplied transparent modes (alpha value is preserved)
  MODE_rgbA = 7,
  MODE_bgrA = 8,
  MODE_Argb = 9,
  MODE_rgbA_4444 = 10,
  // YUV modes must come after RGB ones.
  MODE_YUV = 11, MODE_YUVA = 12,  // yuv 4:2:0
  MODE_LAST = 13                  // one past the last mode listed above
 } WEBP_CSP_MODE;
 // Some useful helper functions:
 // Returns true for the modes whose RGB channels are premultiplied by alpha
 // (the lower-case 'rgbA', 'bgrA', 'Argb' and 'rgbA_4444' ones).
 static WEBP_INLINE int WebPIsPremultipliedMode(WEBP_CSP_MODE mode) {
  switch (mode) {
    case MODE_rgbA:
    case MODE_bgrA:
    case MODE_Argb:
    case MODE_rgbA_4444:
      return 1;
    default:
      return 0;
  }
 }
 // Returns true for every mode that carries an alpha channel, premultiplied
 // or not.
 static WEBP_INLINE int WebPIsAlphaMode(WEBP_CSP_MODE mode) {
  switch (mode) {
    case MODE_RGBA:
    case MODE_BGRA:
    case MODE_ARGB:
    case MODE_RGBA_4444:
    case MODE_YUVA:
      return 1;
    default:
      return WebPIsPremultipliedMode(mode);
  }
 }
 // Returns true for RGB-family modes, i.e. those listed before MODE_YUV.
 static WEBP_INLINE int WebPIsRGBMode(WEBP_CSP_MODE mode) {
  return (MODE_YUV > mode);
 }
 //------------------------------------------------------------------------------
 // WebPDecBuffer: Generic structure for describing the output sample buffer.
 // Description of a packed (single-plane) output buffer.
 struct WebPRGBABuffer {    // view as RGBA
  uint8_t* rgba;    // pointer to RGBA samples
  int stride;       // stride in bytes from one scanline to the next.
  size_t size;      // total size of the *rgba buffer.
 };
 // Description of a planar output buffer: four separate planes, each with its
 // own pointer, stride and size.
 struct WebPYUVABuffer {              // view as YUVA
  uint8_t* y, *u, *v, *a;     // pointer to luma, chroma U/V, alpha samples
  int y_stride;               // luma stride
  int u_stride, v_stride;     // chroma strides
  int a_stride;               // alpha stride
  size_t y_size;              // luma plane size
  size_t u_size, v_size;      // chroma planes size
  size_t a_size;              // alpha-plane size
 };
 // Output buffer
 struct WebPDecBuffer {
  WEBP_CSP_MODE colorspace;  // Colorspace.
  int width, height;         // Dimensions.
  int is_external_memory;    // If true, 'private_memory' pointer is not used.
  union {
    WebPRGBABuffer RGBA;
    WebPYUVABuffer YUVA;
  } u;                       // Union of buffer parameters (RGBA or YUVA view;
                             // presumably selected by 'colorspace' - confirm).
  uint32_t       pad[4];     // padding for later use
  uint8_t* private_memory;   // Internally allocated memory (only when
                             // is_external_memory is false). Should not be used
                             // externally, but accessed via the buffer union.
 };
 // Version-checked implementation behind WebPInitDecBuffer(); not meant to be
 // called directly.
 WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer*, int);
 // Resets 'buffer' to the empty state. Must be the first call made on a
 // WebPDecBuffer. Returns false when the ABI version does not match.
 static WEBP_INLINE int WebPInitDecBuffer(WebPDecBuffer* buffer) {
  const int init_ok =
      WebPInitDecBufferInternal(buffer, WEBP_DECODER_ABI_VERSION);
  return init_ok;
 }
 // Free any memory associated with the buffer. Must always be called last.
 // Note: doesn't free the 'buffer' structure itself.
 WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* buffer);
 //------------------------------------------------------------------------------
 // Status codes returned by the decoding entry points. Numeric values are
 // spelled out explicitly; they match the implicit numbering (0..7).
 typedef enum VP8StatusCode {
  VP8_STATUS_OK                  = 0,
  VP8_STATUS_OUT_OF_MEMORY       = 1,
  VP8_STATUS_INVALID_PARAM       = 2,
  VP8_STATUS_BITSTREAM_ERROR     = 3,
  VP8_STATUS_UNSUPPORTED_FEATURE = 4,
  VP8_STATUS_SUSPENDED           = 5,
  VP8_STATUS_USER_ABORT          = 6,
  VP8_STATUS_NOT_ENOUGH_DATA     = 7
 } VP8StatusCode;
 //------------------------------------------------------------------------------
 // Incremental decoding
 //
 // This API allows streamlined decoding of partial data.
 // Picture can be incrementally decoded as data become available thanks to the
 // WebPIDecoder object. This object can be left in a SUSPENDED state if the
 // picture is only partially decoded, pending additional input.
 // Code example:
 //
 //   WebPInitDecBuffer(&buffer);
 //   buffer.colorspace = mode;
 //   ...
 //   WebPIDecoder* idec = WebPINewDecoder(&buffer);
 //   while (has_more_data) {
 //     // ... (get additional data)
 //     status = WebPIAppend(idec, new_data, new_data_size);
 //     if (status != VP8_STATUS_OK && status != VP8_STATUS_SUSPENDED) {
 //       break;    // an error occurred
 //     }
 //
 //     // The above call decodes the current available buffer.
 //     // Part of the image can now be refreshed by calling to
 //     // WebPIDecGetRGB()/WebPIDecGetYUVA() etc.
 //   }
 //   WebPIDelete(idec);
 // Creates a new incremental decoder with the supplied buffer parameter.
 // This output_buffer can be passed NULL, in which case a default output buffer
 // is used (with MODE_RGB). Otherwise, an internal reference to 'output_buffer'
 // is kept, which means that the lifespan of 'output_buffer' must be larger than
 // that of the returned WebPIDecoder object.
 // The supplied 'output_buffer' content MUST NOT be changed between calls to
 // WebPIAppend() or WebPIUpdate() unless 'output_buffer.is_external_memory' is
 // set to 1. In such a case, it is allowed to modify the pointers, size and
 // stride of output_buffer.u.RGBA or output_buffer.u.YUVA, provided they remain
 // within valid bounds.
 // All other fields of WebPDecBuffer MUST remain constant between calls.
 // Returns NULL if the allocation failed.
 WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer);
 // This function allocates and initializes an incremental-decoder object, which
 // will output the RGB/A samples specified by 'csp' into a preallocated
 // buffer 'output_buffer'. The size of this buffer is at least
 // 'output_buffer_size' and the stride (distance in bytes between two scanlines)
 // is specified by 'output_stride'.
 // Additionally, output_buffer can be passed NULL in which case the output
 // buffer will be allocated automatically when the decoding starts. The
 // colorspace 'csp' is taken into account for allocating this buffer. All other
 // parameters are ignored.
 // Returns NULL if the allocation failed, or if some parameters are invalid.
 WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
    WEBP_CSP_MODE csp,
    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
 // This function allocates and initializes an incremental-decoder object, which
 // will output the raw luma/chroma samples into preallocated planes if
 // supplied. The luma plane is specified by its pointer 'luma', its size
 // 'luma_size' and its stride 'luma_stride'. Similarly, the chroma-u plane
 // is specified by the 'u', 'u_size' and 'u_stride' parameters, and the chroma-v
 // plane by 'v', 'v_size' and 'v_stride'. And same for the alpha-plane. The 'a'
 // pointer can be passed NULL in case one is not interested in the transparency
 // plane.
 // Conversely, 'luma' can be passed NULL if no preallocated planes are supplied.
 // In this case, the output buffer will be automatically allocated (using
 // MODE_YUVA) when decoding starts. All parameters are then ignored.
 // Returns NULL if the allocation failed or if a parameter is invalid.
 WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA(
    uint8_t* luma, size_t luma_size, int luma_stride,
    uint8_t* u, size_t u_size, int u_stride,
    uint8_t* v, size_t v_size, int v_stride,
    uint8_t* a, size_t a_size, int a_stride);
 // Deprecated version of the above, without the alpha plane.
 // Kept for backward compatibility.
 WEBP_EXTERN(WebPIDecoder*) WebPINewYUV(
    uint8_t* luma, size_t luma_size, int luma_stride,
    uint8_t* u, size_t u_size, int u_stride,
    uint8_t* v, size_t v_size, int v_stride);
 // Deletes the WebPIDecoder object and associated memory. Must always be called
 // if WebPINewDecoder, WebPINewRGB or WebPINewYUV succeeded.
 WEBP_EXTERN(void) WebPIDelete(WebPIDecoder* idec);
 // Copies and decodes the next available data. Returns VP8_STATUS_OK when
 // the image is successfully decoded. Returns VP8_STATUS_SUSPENDED when more
 // data is expected. Returns error in other cases.
 WEBP_EXTERN(VP8StatusCode) WebPIAppend(
    WebPIDecoder* idec, const uint8_t* data, size_t data_size);
 // A variant of the above function to be used when data buffer contains
 // partial data from the beginning. In this case data buffer is not copied
 // to the internal memory.
 // Note that the value of the 'data' pointer can change between calls to
 // WebPIUpdate, for instance when the data buffer is resized to fit larger data.
 WEBP_EXTERN(VP8StatusCode) WebPIUpdate(
    WebPIDecoder* idec, const uint8_t* data, size_t data_size);
 // Returns the RGB/A image decoded so far. Returns NULL if output params
 // are not initialized yet. The RGB/A output type corresponds to the colorspace
 // specified during call to WebPINewDecoder() or WebPINewRGB().
 // *last_y is the index of last decoded row in raster scan order. Some pointers
 // (*last_y, *width etc.) can be NULL if corresponding information is not
 // needed.
 WEBP_EXTERN(uint8_t*) WebPIDecGetRGB(
    const WebPIDecoder* idec, int* last_y,
    int* width, int* height, int* stride);
 // Same as above function to get a YUVA image. Returns pointer to the luma
 // plane or NULL in case of error. If there is no alpha information
 // the alpha pointer '*a' will be returned NULL.
 WEBP_EXTERN(uint8_t*) WebPIDecGetYUVA(
    const WebPIDecoder* idec, int* last_y,
    uint8_t** u, uint8_t** v, uint8_t** a,
    int* width, int* height, int* stride, int* uv_stride, int* a_stride);
 // Deprecated: alpha-less wrapper around WebPIDecGetYUVA(), kept for backward
 // compatibility. Any alpha information is ignored because NULL is forwarded
 // for both the alpha plane and the alpha stride.
 static WEBP_INLINE uint8_t* WebPIDecGetYUV(
    const WebPIDecoder* idec, int* last_y, uint8_t** u, uint8_t** v,
    int* width, int* height, int* stride, int* uv_stride) {
  uint8_t** const no_alpha_plane = NULL;
  int* const no_alpha_stride = NULL;
  return WebPIDecGetYUVA(idec, last_y, u, v, no_alpha_plane,
                         width, height, stride, uv_stride, no_alpha_stride);
 }
 // Generic call to retrieve information about the displayable area.
 // If non NULL, the left/top/width/height pointers are filled with the visible
 // rectangular area so far.
 // Returns NULL in case the incremental decoder object is in an invalid state.
 // Otherwise returns the pointer to the internal representation. This structure
 // is read-only, tied to WebPIDecoder's lifespan and should not be modified.
 WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea(
    const WebPIDecoder* idec, int* left, int* top, int* width, int* height);
 //------------------------------------------------------------------------------
 // Advanced decoding parametrization
 //
 //  Code sample for using the advanced decoding API
 /*
     // A) Init a configuration object
     WebPDecoderConfig config;
     CHECK(WebPInitDecoderConfig(&config));
     // B) optional: retrieve the bitstream's features.
     CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
     // C) Adjust 'config', if needed
     config.no_fancy_upsampling = 1;
     config.output.colorspace = MODE_BGRA;
     // etc.
     // Note that you can also make config.output point to an externally
     // supplied memory buffer, provided it's big enough to store the decoded
     // picture. Otherwise, config.output will just be used to allocate memory
     // and store the decoded picture.
     // D) Decode!
     CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
     // E) Decoded image is now in config.output (and config.output.u.RGBA)
     // F) Reclaim memory allocated in config's object. It's safe to call
     // this function even if the memory is external and wasn't allocated
     // by WebPDecode().
     WebPFreeDecBuffer(&config.output);
 */
 // Features gathered from the bitstream. WebPGetFeatures() fills this
 // structure; it is also embedded as the 'input' member of WebPDecoderConfig.
 struct WebPBitstreamFeatures {
  int width;          // Width in pixels, as read from the bitstream.
  int height;         // Height in pixels, as read from the bitstream.
  int has_alpha;      // True if the bitstream contains an alpha channel.
  int has_animation;  // True if the bitstream is an animation.
  int format;         // 0 = undefined (/mixed), 1 = lossy, 2 = lossless
  // Unused for now:
  int no_incremental_decoding;  // if true, using incremental decoding is not
                                // recommended.
  int rotate;                   // TODO(later)
  int uv_sampling;              // should be 0 for now. TODO(later)
  uint32_t pad[2];              // padding for later use
 };
 // Version-checked implementation behind WebPGetFeatures(); not meant to be
 // called directly.
 WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal(
    const uint8_t*, size_t, WebPBitstreamFeatures*, int);
 // Parses the bitstream headers found in 'data' and stores what was gathered
 // into '*features'.
 // Returns VP8_STATUS_OK on success, VP8_STATUS_NOT_ENOUGH_DATA when more
 // input is required to read the headers, and another error code otherwise.
 static WEBP_INLINE VP8StatusCode WebPGetFeatures(
    const uint8_t* data, size_t data_size,
    WebPBitstreamFeatures* features) {
  const VP8StatusCode status = WebPGetFeaturesInternal(
      data, data_size, features, WEBP_DECODER_ABI_VERSION);
  return status;
 }
 // Decoding options. Cropping is applied first, scaling afterward.
 struct WebPDecoderOptions {
  int bypass_filtering;          // if true, skip the in-loop filtering
  int no_fancy_upsampling;       // if true, use faster pointwise upsampler
  // Cropping (applied _first_).
  int use_cropping;              // if true, cropping is enabled
  int crop_left;                 // left position of the cropping area
  int crop_top;                  // top position of the cropping area
                                 // (both will be snapped to even values)
  int crop_width;                // width of the cropping area
  int crop_height;               // height of the cropping area
  // Scaling (applied _afterward_).
  int use_scaling;               // if true, scaling is enabled
  int scaled_width;              // final width
  int scaled_height;             // final height
  int use_threads;               // if true, use multi-threaded decoding
  int dithering_strength;        // dithering strength (0=Off, 100=full)
  int flip;                      // flip output vertically
  int alpha_dithering_strength;  // alpha dithering strength in [0..100]
  // Unused for now:
  int force_rotation;            // forced rotation (to be applied _last_)
  int no_enhancement;            // if true, discard enhancement layer
  uint32_t pad[3];               // padding for later use
 };
 // Main object storing the configuration for advanced decoding. It bundles the
 // bitstream features, the output buffer and the decoding options, and is the
 // structure passed to WebPInitDecoderConfig(), WebPIDecode() and WebPDecode().
 struct WebPDecoderConfig {
  WebPBitstreamFeatures input;  // Immutable bitstream features (optional)
  WebPDecBuffer output;         // Output buffer (can point to external mem)
  WebPDecoderOptions options;   // Decoding options
 };
 // Version-checked implementation behind WebPInitDecoderConfig(); not meant to
 // be called directly.
 WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig*, int);
 // Resets 'config' to the empty state. Must always be the first call made on a
 // WebPDecoderConfig, unless WebPGetFeatures() is to be called.
 // Returns false in case of mismatched version.
 static WEBP_INLINE int WebPInitDecoderConfig(WebPDecoderConfig* config) {
  const int init_ok =
      WebPInitDecoderConfigInternal(config, WEBP_DECODER_ABI_VERSION);
  return init_ok;
 }
 // Instantiate a new incremental decoder object with the requested
 // configuration. The bitstream can be passed using 'data' and 'data_size'
 // parameter, in which case the features will be parsed and stored into
 // config->input. Otherwise, 'data' can be NULL and no parsing will occur.
 // Note that 'config' can be NULL too, in which case a default configuration
 // is used.
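 // The returned WebPIDecoder object must always be deleted by calling
 // WebPIDelete().
 // Returns NULL in case of error. NOTE(review): this comment used to say that
 // config->status reflects the error condition, but WebPDecoderConfig as
 // declared in this header has no 'status' member.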
 WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, size_t data_size,
                                       WebPDecoderConfig* config);
 // Non-incremental version. This version decodes the full data at once, taking
 // 'config' into account. Returns decoding status (which should be VP8_STATUS_OK
 // if the decoding was successful).
 WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, size_t data_size,
                                      WebPDecoderConfig* config);
 #ifdef __cplusplus
 }    // extern "C"
 #endif
 #endif  /* WEBP_WEBP_DECODE_H_ */
--- a/src/loaders/webp/webp/format_constants.h
+++ b/src/loaders/webp/webp/format_constants.h
@ -0,0 +1,88 @@
 // Copyright 2012 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Internal header for constants related to WebP file format.
 //
 // Author: Urvang (urvang@google.com)
 #ifndef WEBP_WEBP_FORMAT_CONSTANTS_H_
 #define WEBP_WEBP_FORMAT_CONSTANTS_H_
 // Create fourcc of the chunk from the chunk tag characters.
 // 'a' ends up in the lowest byte and 'd' in the highest one. Every operand is
 // converted to uint32_t *before* being shifted, so a tag byte >= 0x80 in the
 // top position is never shifted into the sign bit of a signed int.
 #define MKFOURCC(a, b, c, d)                                  \
  ((uint32_t)(a) | (uint32_t)(b) << 8 | (uint32_t)(c) << 16 | \
   (uint32_t)(d) << 24)
 // VP8 related constants.
 #define VP8_SIGNATURE 0x9d012a              // Signature in VP8 data.
 #define VP8_MAX_PARTITION0_SIZE (1 << 19)   // max size of mode partition
 #define VP8_MAX_PARTITION_SIZE  (1 << 24)   // max size for token partition
 #define VP8_FRAME_HEADER_SIZE 10  // Size of the frame header within VP8 data.
 // VP8L related constants.
 #define VP8L_SIGNATURE_SIZE          1      // VP8L signature size.
 #define VP8L_MAGIC_BYTE              0x2f   // VP8L signature byte.
 #define VP8L_IMAGE_SIZE_BITS         14     // Number of bits used to store
                                            // width and height.
 #define VP8L_VERSION_BITS            3      // 3 bits reserved for version.
 #define VP8L_VERSION                 0      // version 0
 #define VP8L_FRAME_HEADER_SIZE       5      // Size of the VP8L frame header.
 #define MAX_PALETTE_SIZE             256
 #define MAX_CACHE_BITS               11
 #define HUFFMAN_CODES_PER_META_CODE  5
 #define ARGB_BLACK                   0xff000000
 #define DEFAULT_CODE_LENGTH          8
 #define MAX_ALLOWED_CODE_LENGTH      15
 #define NUM_LITERAL_CODES            256
 #define NUM_LENGTH_CODES             24
 #define NUM_DISTANCE_CODES           40
 #define CODE_LENGTH_CODES            19
 #define MIN_HUFFMAN_BITS             2  // min number of Huffman bits
 #define MAX_HUFFMAN_BITS             9  // max number of Huffman bits
 #define TRANSFORM_PRESENT            1  // The bit to be written when next data
                                        // to be read is a transform.
 #define NUM_TRANSFORMS               4  // Maximum number of allowed transform
                                        // in a bitstream.
 // Identifiers of the VP8L image transforms. The numeric values are assigned
 // explicitly and there are four of them, matching NUM_TRANSFORMS.
 typedef enum {
  PREDICTOR_TRANSFORM      = 0,
  CROSS_COLOR_TRANSFORM    = 1,
  SUBTRACT_GREEN           = 2,
  COLOR_INDEXING_TRANSFORM = 3
 } VP8LImageTransformType;
 // Alpha related constants.
 #define ALPHA_HEADER_LEN            1
 #define ALPHA_NO_COMPRESSION        0
 #define ALPHA_LOSSLESS_COMPRESSION  1
 #define ALPHA_PREPROCESSED_LEVELS   1
 // Mux related constants.
 #define TAG_SIZE           4     // Size of a chunk tag (e.g. "VP8L").
 #define CHUNK_SIZE_BYTES   4     // Size needed to store chunk's size.
 #define CHUNK_HEADER_SIZE  8     // Size of a chunk header.
 #define RIFF_HEADER_SIZE   12    // Size of the RIFF header ("RIFFnnnnWEBP").
 #define ANMF_CHUNK_SIZE    16    // Size of an ANMF chunk.
 #define ANIM_CHUNK_SIZE    6     // Size of an ANIM chunk.
 #define FRGM_CHUNK_SIZE    6     // Size of a FRGM chunk.
 #define VP8X_CHUNK_SIZE    10    // Size of a VP8X chunk.
 #define MAX_CANVAS_SIZE     (1 << 24)     // 24-bit max for VP8X width/height.
 #define MAX_IMAGE_AREA      (1ULL << 32)  // 32-bit max for width x height.
 #define MAX_LOOP_COUNT      (1 << 16)     // maximum value for loop-count
 #define MAX_DURATION        (1 << 24)     // maximum duration
 #define MAX_POSITION_OFFSET (1 << 24)     // maximum frame/fragment x/y offset
 // Maximum chunk payload is such that adding the header and padding won't
 // overflow a uint32_t.
 #define MAX_CHUNK_PAYLOAD (~0U - CHUNK_HEADER_SIZE - 1)
 #endif  /* WEBP_WEBP_FORMAT_CONSTANTS_H_ */
--- a/src/loaders/webp/webp/meson.build
+++ b/src/loaders/webp/webp/meson.build
@ -0,0 +1,10 @@
 # Header files provided by this directory.
 source_file = [
   'decode.h',
   'format_constants.h',
   'types.h'
 ]
 # Append a dependency that exposes this directory as an include path together
 # with the headers listed above. 'webp_deb' is not defined in this file;
 # presumably a parent meson.build declares it -- TODO confirm.
 webp_deb += [declare_dependency(
    include_directories : include_directories('.'),
    sources : source_file
 )]
--- a/src/loaders/webp/webp/types.h
+++ b/src/loaders/webp/webp/types.h
@ -0,0 +1,53 @@
 // Copyright 2010 Google Inc. All Rights Reserved.
 //
 // Use of this source code is governed by a BSD-style license
 // that can be found in the COPYING file in the root of the source
 // tree. An additional intellectual property rights grant can be found
 // in the file PATENTS. All contributing project authors may
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
 //  Common types
 //
 // Author: Skal (pascal.massimino@gmail.com)
 #ifndef WEBP_WEBP_TYPES_H_
 #define WEBP_WEBP_TYPES_H_
 #include <stddef.h>  // for size_t
 #ifndef _MSC_VER
 #include <inttypes.h>
 #if defined(__cplusplus) || !defined(__STRICT_ANSI__) || \
    (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
 #define WEBP_INLINE inline
 #else
 #define WEBP_INLINE
 #endif
 #else
 typedef signed   char int8_t;
 typedef unsigned char uint8_t;
 typedef signed   short int16_t;
 typedef unsigned short uint16_t;
 typedef signed   int int32_t;
 typedef unsigned int uint32_t;
 typedef unsigned long long int uint64_t;
 typedef long long int int64_t;
 #define WEBP_INLINE __forceinline
 #endif  /* _MSC_VER */
 #ifndef WEBP_EXTERN
 // This explicitly marks library functions and allows for changing the
 // signature for e.g., Windows DLL builds.
 # if defined(__GNUC__) && __GNUC__ >= 4
 #  define WEBP_EXTERN(type) extern __attribute__ ((visibility ("default"))) type
 # else
 #  define WEBP_EXTERN(type) extern type
 # endif  /* __GNUC__ >= 4 */
 #endif  /* WEBP_EXTERN */
 // Macro to check ABI compatibility (same major revision number)
 #define WEBP_ABI_IS_INCOMPATIBLE(a, b) (((a) >> 8) != ((b) >> 8))
 #endif  /* WEBP_WEBP_TYPES_H_ */