From ce7f4cb7c33e06c8cb6e190a5901c219202cd731 Mon Sep 17 00:00:00 2001
From: Junsu Choi <jsuya.choi@samsung.com>
Date: Tue, 6 Feb 2024 12:09:29 +0900
Subject: [PATCH] loader/webp: Support static webp loader

Built to libwebp code in to support the static option of webp loader.
Code from :https://github.com/jacklicn/libwebp

This forking version only contains the minimal webp decoding feature.

Binary Size: +68kb

Co-authored-by: Hermet Park <hermet@lottiefiles.com>

issue: https://github.com/thorvg/thorvg/issues/1427
---
 .github/workflows/build_ubuntu.yml          |    2 +-
 .github/workflows/codeql.yml                |    2 +-
 meson.build                                 |   54 +-
 meson_options.txt                           |    4 +-
 src/examples/PictureWebp.cpp                |    1 -
 src/loaders/external_webp/meson.build       |   14 +-
 src/loaders/meson.build                     |    7 +-
 src/loaders/webp/dec/alpha.cpp              |  167 ++
 src/loaders/webp/dec/alphai.h               |   55 +
 src/loaders/webp/dec/buffer.cpp             |  238 +++
 src/loaders/webp/dec/common.h               |   54 +
 src/loaders/webp/dec/decode_vp8.h           |  177 +++
 src/loaders/webp/dec/frame.cpp              |  713 +++++++++
 src/loaders/webp/dec/io.cpp                 |  392 +++++
 src/loaders/webp/dec/meson.build            |   22 +
 src/loaders/webp/dec/quant.cpp              |  110 ++
 src/loaders/webp/dec/tree.cpp               |  525 ++++++
 src/loaders/webp/dec/vp8.cpp                |  681 ++++++++
 src/loaders/webp/dec/vp8i.h                 |  307 ++++
 src/loaders/webp/dec/vp8l.cpp               | 1580 +++++++++++++++++++
 src/loaders/webp/dec/vp8li.h                |  136 ++
 src/loaders/webp/dec/webp.cpp               |  674 ++++++++
 src/loaders/webp/dec/webpi.h                |  108 ++
 src/loaders/webp/dsp/alpha_processing.cpp   |  377 +++++
 src/loaders/webp/dsp/argb.cpp               |   68 +
 src/loaders/webp/dsp/cpu.cpp                |  120 ++
 src/loaders/webp/dsp/dec.cpp                |  766 +++++++++
 src/loaders/webp/dsp/dec_clip_tables.cpp    |  366 +++++
 src/loaders/webp/dsp/dsp.h                  |  319 ++++
 src/loaders/webp/dsp/filters.cpp            |  240 +++
 src/loaders/webp/dsp/lossless.cpp           |  636 ++++++++
 src/loaders/webp/dsp/lossless.h             |  303 ++++
 src/loaders/webp/dsp/meson.build            |   20 +
 src/loaders/webp/dsp/rescaler.cpp           |  115 ++
 src/loaders/webp/dsp/upsampling.cpp         |  252 +++
 src/loaders/webp/dsp/yuv.cpp                |  166 ++
 src/loaders/webp/dsp/yuv.h                  |  321 ++++
 src/loaders/webp/meson.build                |   17 +
 src/loaders/webp/tvgWebpLoader.cpp          |  150 ++
 src/loaders/webp/tvgWebpLoader.h            |   48 +
 src/loaders/webp/utils/bit_reader.cpp       |  216 +++
 src/loaders/webp/utils/bit_reader.h         |  168 ++
 src/loaders/webp/utils/bit_reader_inl.h     |  172 ++
 src/loaders/webp/utils/color_cache.cpp      |   48 +
 src/loaders/webp/utils/color_cache.h        |   74 +
 src/loaders/webp/utils/endian_inl.h         |  100 ++
 src/loaders/webp/utils/huffman.cpp          |  204 +++
 src/loaders/webp/utils/huffman.h            |   67 +
 src/loaders/webp/utils/meson.build          |   22 +
 src/loaders/webp/utils/quant_levels_dec.cpp |  279 ++++
 src/loaders/webp/utils/quant_levels_dec.h   |   35 +
 src/loaders/webp/utils/random.cpp           |   43 +
 src/loaders/webp/utils/random.h             |   63 +
 src/loaders/webp/utils/rescaler.cpp         |   82 +
 src/loaders/webp/utils/rescaler.h           |   78 +
 src/loaders/webp/utils/utils.h              |   68 +
 src/loaders/webp/webp/decode.h              |  493 ++++++
 src/loaders/webp/webp/format_constants.h    |   88 ++
 src/loaders/webp/webp/meson.build           |   10 +
 src/loaders/webp/webp/types.h               |   53 +
 60 files changed, 12630 insertions(+), 40 deletions(-)
 create mode 100644 src/loaders/webp/dec/alpha.cpp
 create mode 100644 src/loaders/webp/dec/alphai.h
 create mode 100644 src/loaders/webp/dec/buffer.cpp
 create mode 100644 src/loaders/webp/dec/common.h
 create mode 100644 src/loaders/webp/dec/decode_vp8.h
 create mode 100644 src/loaders/webp/dec/frame.cpp
 create mode 100644 src/loaders/webp/dec/io.cpp
 create mode 100644 src/loaders/webp/dec/meson.build
 create mode 100644 src/loaders/webp/dec/quant.cpp
 create mode 100644 src/loaders/webp/dec/tree.cpp
 create mode 100644 src/loaders/webp/dec/vp8.cpp
 create mode 100644 src/loaders/webp/dec/vp8i.h
 create mode 100644 src/loaders/webp/dec/vp8l.cpp
 create mode 100644 src/loaders/webp/dec/vp8li.h
 create mode 100644 src/loaders/webp/dec/webp.cpp
 create mode 100644 src/loaders/webp/dec/webpi.h
 create mode 100644 src/loaders/webp/dsp/alpha_processing.cpp
 create mode 100644 src/loaders/webp/dsp/argb.cpp
 create mode 100644 src/loaders/webp/dsp/cpu.cpp
 create mode 100644 src/loaders/webp/dsp/dec.cpp
 create mode 100644 src/loaders/webp/dsp/dec_clip_tables.cpp
 create mode 100644 src/loaders/webp/dsp/dsp.h
 create mode 100644 src/loaders/webp/dsp/filters.cpp
 create mode 100644 src/loaders/webp/dsp/lossless.cpp
 create mode 100644 src/loaders/webp/dsp/lossless.h
 create mode 100644 src/loaders/webp/dsp/meson.build
 create mode 100644 src/loaders/webp/dsp/rescaler.cpp
 create mode 100644 src/loaders/webp/dsp/upsampling.cpp
 create mode 100644 src/loaders/webp/dsp/yuv.cpp
 create mode 100644 src/loaders/webp/dsp/yuv.h
 create mode 100644 src/loaders/webp/meson.build
 create mode 100644 src/loaders/webp/tvgWebpLoader.cpp
 create mode 100644 src/loaders/webp/tvgWebpLoader.h
 create mode 100644 src/loaders/webp/utils/bit_reader.cpp
 create mode 100644 src/loaders/webp/utils/bit_reader.h
 create mode 100644 src/loaders/webp/utils/bit_reader_inl.h
 create mode 100644 src/loaders/webp/utils/color_cache.cpp
 create mode 100644 src/loaders/webp/utils/color_cache.h
 create mode 100644 src/loaders/webp/utils/endian_inl.h
 create mode 100644 src/loaders/webp/utils/huffman.cpp
 create mode 100644 src/loaders/webp/utils/huffman.h
 create mode 100644 src/loaders/webp/utils/meson.build
 create mode 100644 src/loaders/webp/utils/quant_levels_dec.cpp
 create mode 100644 src/loaders/webp/utils/quant_levels_dec.h
 create mode 100644 src/loaders/webp/utils/random.cpp
 create mode 100644 src/loaders/webp/utils/random.h
 create mode 100644 src/loaders/webp/utils/rescaler.cpp
 create mode 100644 src/loaders/webp/utils/rescaler.h
 create mode 100644 src/loaders/webp/utils/utils.h
 create mode 100644 src/loaders/webp/webp/decode.h
 create mode 100644 src/loaders/webp/webp/format_constants.h
 create mode 100644 src/loaders/webp/webp/meson.build
 create mode 100644 src/loaders/webp/webp/types.h

diff --git a/.github/workflows/build_ubuntu.yml b/.github/workflows/build_ubuntu.yml
index 7895429d..19d52b17 100644
--- a/.github/workflows/build_ubuntu.yml
+++ b/.github/workflows/build_ubuntu.yml
@@ -87,7 +87,7 @@ jobs:
     - name: Build & Run memcheck Script(ASAN)
       run: |
         sudo rm -rf ./build
-        meson setup build -Db_sanitize="address,undefined" -Dloaders="all, webp_beta" -Dsavers="all" -Dtests="true" -Dbindings="capi"
+        meson setup build -Db_sanitize="address,undefined" -Dloaders="all" -Dsavers="all" -Dtests="true" -Dbindings="capi"
         sudo ninja -C build install
         export PATH=$PATH:~/.local/bin/
         chmod +x "${GITHUB_WORKSPACE}/.github/workflows/memcheck_asan.sh"
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 518079c7..822b8fc0 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -71,7 +71,7 @@ jobs:
         sudo pip3 install meson
         sudo apt-get install libturbojpeg0-dev libpng-dev libwebp-dev
 
-        meson . build -Dbindings="capi" -Dloaders="all, webp_beta" -Dsavers="all" -Dtools="all"
+        meson . build -Dbindings="capi" -Dloaders="all" -Dsavers="all" -Dtools="all"
         sudo ninja -C build install
          
 
diff --git a/meson.build b/meson.build
index e9cf13dd..d46eb560 100644
--- a/meson.build
+++ b/meson.build
@@ -62,7 +62,7 @@ if all_loaders or get_option('loaders').contains('ttf') == true
     config_h.set10('THORVG_TTF_LOADER_SUPPORT', true)
 endif
 
-if get_option('loaders').contains('webp_beta') == true
+if all_loaders or get_option('loaders').contains('webp') == true
     config_h.set10('THORVG_WEBP_LOADER_SUPPORT', true)
 endif
 
@@ -134,31 +134,31 @@ endif
 summary = '''
 
 Summary:
-    ThorVG version:        @0@
-    Build Type:            @1@
-    Prefix:                @2@
-    Multi-Tasking:         @3@
-    SIMD Instruction:      @4@
-    Raster Engine (SW):    @5@
-    Raster Engine (GL):    @6@
-    Raster Engine (WG):    @7@
-    Loader (TVG):          @8@
-    Loader (SVG):          @9@
-    Loader (TTF):          @10@
-    Loader (LOTTIE):       @11@
-    Loader (PNG):          @12@
-    Loader (JPG):          @13@
-    Loader (WEBP_BETA):    @14@
-    Saver (TVG):           @15@
-    Saver (GIF):           @16@
-    Binding (CAPI):        @17@
-    Binding (WASM_BETA):   @18@
-    Log Message:           @19@
-    Tests:                 @20@
-    Examples:              @21@
-    Tool (Svg2Tvg):        @22@
-    Tool (Svg2Png):        @23@
-    Tool (Lottie2Gif):     @24@
+    ThorVG version:          @0@
+    Build Type:              @1@
+    Prefix:                  @2@
+    Multi-Tasking:           @3@
+    SIMD Instruction:        @4@
+    Raster Engine (SW):      @5@
+    Raster Engine (GL_BETA): @6@
+    Raster Engine (WG_BETA): @7@
+    Loader (TVG):            @8@
+    Loader (SVG):            @9@
+    Loader (TTF):            @10@
+    Loader (LOTTIE):         @11@
+    Loader (PNG):            @12@
+    Loader (JPG):            @13@
+    Loader (WEBP):           @14@
+    Saver (TVG):             @15@
+    Saver (GIF):             @16@
+    Binding (CAPI):          @17@
+    Binding (WASM_BETA):     @18@
+    Log Message:             @19@
+    Tests:                   @20@
+    Examples:                @21@
+    Tool (Svg2Tvg):          @22@
+    Tool (Svg2Png):          @23@
+    Tool (Lottie2Gif):       @24@
 
 '''.format(
         meson.project_version(),
@@ -175,7 +175,7 @@ Summary:
         all_loaders or get_option('loaders').contains('lottie'),
         all_loaders or get_option('loaders').contains('png'),
         all_loaders or get_option('loaders').contains('jpg'),
-        get_option('loaders').contains('webp_beta'),
+        all_loaders or get_option('loaders').contains('webp'),
         all_savers or get_option('savers').contains('tvg'),
         all_savers or get_option('savers').contains('gif'),
         get_option('bindings').contains('capi'),
diff --git a/meson_options.txt b/meson_options.txt
index c9e6c89a..b2501ceb 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -6,9 +6,9 @@ option('engines',
 
 option('loaders',
    type: 'array',
-   choices: ['', 'tvg', 'svg', 'png', 'jpg', 'lottie', 'ttf', 'webp_beta', 'all'],
+   choices: ['', 'tvg', 'svg', 'png', 'jpg', 'lottie', 'ttf', 'webp', 'all'],
    value: ['svg', 'tvg', 'lottie'],
-   description: 'Enable File Loaders in thorvg ("all" does not include "*_beta".)')
+   description: 'Enable File Loaders in thorvg')
 
 option('savers',
    type: 'array',
diff --git a/src/examples/PictureWebp.cpp b/src/examples/PictureWebp.cpp
index 31a9eaeb..b5b2c50d 100644
--- a/src/examples/PictureWebp.cpp
+++ b/src/examples/PictureWebp.cpp
@@ -75,7 +75,6 @@ void tvgDrawCmds(tvg::Canvas* canvas)
     picture->translate(400, 0);
     picture->scale(0.8);
     canvas->push(std::move(picture));
-
 }
 
 
diff --git a/src/loaders/external_webp/meson.build b/src/loaders/external_webp/meson.build
index 70cdead6..69c3e336 100644
--- a/src/loaders/external_webp/meson.build
+++ b/src/loaders/external_webp/meson.build
@@ -3,10 +3,12 @@ source_file = [
    'tvgWebpLoader.cpp',
 ]
 
-webp_dep = dependency('libwebp', required: true)
+webp_dep = dependency('libwebp', required: false)
 
-subloader_dep += [declare_dependency(
-    include_directories : include_directories('.'),
-    dependencies : webp_dep,
-    sources : source_file
-    )]
\ No newline at end of file
+if webp_dep.found()
+   subloader_dep += [declare_dependency(
+      include_directories : include_directories('.'),
+      dependencies : webp_dep,
+      sources : source_file
+      )]
+endif
\ No newline at end of file
diff --git a/src/loaders/meson.build b/src/loaders/meson.build
index 16526802..41db2a13 100644
--- a/src/loaders/meson.build
+++ b/src/loaders/meson.build
@@ -38,11 +38,14 @@ if all_loaders or get_option('loaders').contains('jpg') == true
     endif
 endif
 
-if get_option('loaders').contains('webp_beta') == true
+if all_loaders or get_option('loaders').contains('webp') == true
     if get_option('static') == true
-        message('static webp is not available, disable webp_beta loader.')
+        subdir('webp')
     else
         subdir('external_webp')
+        if not webp_dep.found()
+            subdir('webp')
+        endif
     endif
 endif
 
diff --git a/src/loaders/webp/dec/alpha.cpp b/src/loaders/webp/dec/alpha.cpp
new file mode 100644
index 00000000..3d3df148
--- /dev/null
+++ b/src/loaders/webp/dec/alpha.cpp
@@ -0,0 +1,167 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Alpha-plane decompression.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "./alphai.h"
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "../dsp/dsp.h"
+#include "../utils/quant_levels_dec.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+//------------------------------------------------------------------------------
+// ALPHDecoder object.
+
+ALPHDecoder* ALPHNew(void) {
+  ALPHDecoder* const dec = (ALPHDecoder*)calloc(1ULL, sizeof(*dec));
+  return dec;
+}
+
+void ALPHDelete(ALPHDecoder* const dec) {
+  if (dec != NULL) {
+    VP8LDelete(dec->vp8l_dec_);
+    dec->vp8l_dec_ = NULL;
+    free(dec);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Decoding.
+
+// Initialize alpha decoding by parsing the alpha header and decoding the image
+// header for alpha data stored using lossless compression.
+// Returns false in case of error in alpha header (data too short, invalid
+// compression method or filter, error in lossless header data etc).
+static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
+                    size_t data_size, int width, int height, uint8_t* output) {
+  int ok = 0;
+  const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
+  const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
+  int rsrv;
+
+  assert(width > 0 && height > 0);
+  assert(data != NULL && output != NULL);
+
+  dec->width_ = width;
+  dec->height_ = height;
+
+  if (data_size <= ALPHA_HEADER_LEN) {
+    return 0;
+  }
+
+  dec->method_ = (data[0] >> 0) & 0x03;
+  dec->filter_ = static_cast<WEBP_FILTER_TYPE>((data[0] >> 2) & 0x03);
+  dec->pre_processing_ = (data[0] >> 4) & 0x03;
+  rsrv = (data[0] >> 6) & 0x03;
+  if (dec->method_ < ALPHA_NO_COMPRESSION ||
+      dec->method_ > ALPHA_LOSSLESS_COMPRESSION ||
+      dec->filter_ >= WEBP_FILTER_LAST ||
+      dec->pre_processing_ > ALPHA_PREPROCESSED_LEVELS ||
+      rsrv != 0) {
+    return 0;
+  }
+
+  if (dec->method_ == ALPHA_NO_COMPRESSION) {
+    const size_t alpha_decoded_size = dec->width_ * dec->height_;
+    ok = (alpha_data_size >= alpha_decoded_size);
+  } else {
+    assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
+    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output);
+  }
+  VP8FiltersInit();
+  return ok;
+}
+
+// Decodes, unfilters and dequantizes *at least* 'num_rows' rows of alpha
+// starting from row number 'row'. It assumes that rows up to (row - 1) have
+// already been decoded.
+// Returns false in case of bitstream error.
+static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
+  ALPHDecoder* const alph_dec = dec->alph_dec_;
+  const int width = alph_dec->width_;
+  const int height = alph_dec->height_;
+  WebPUnfilterFunc unfilter_func = WebPUnfilters[alph_dec->filter_];
+  uint8_t* const output = dec->alpha_plane_;
+  if (alph_dec->method_ == ALPHA_NO_COMPRESSION) {
+    const size_t offset = row * width;
+    const size_t num_pixels = num_rows * width;
+    assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN + offset + num_pixels);
+    memcpy(dec->alpha_plane_ + offset,
+           dec->alpha_data_ + ALPHA_HEADER_LEN + offset, num_pixels);
+  } else {  // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
+    assert(alph_dec->vp8l_dec_ != NULL);
+    if (!VP8LDecodeAlphaImageStream(alph_dec, row + num_rows)) {
+      return 0;
+    }
+  }
+
+  if (unfilter_func != NULL) {
+    unfilter_func(width, height, width, row, num_rows, output);
+  }
+
+  if (row + num_rows == dec->pic_hdr_.height_) {
+    dec->is_alpha_decoded_ = 1;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main entry point.
+
+const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                      int row, int num_rows) {
+  const int width = dec->pic_hdr_.width_;
+  const int height = dec->pic_hdr_.height_;
+
+  if (row < 0 || num_rows <= 0 || row + num_rows > height) {
+    return NULL;    // sanity check.
+  }
+
+  if (row == 0) {
+    // Initialize decoding.
+    assert(dec->alpha_plane_ != NULL);
+    dec->alph_dec_ = ALPHNew();
+    if (dec->alph_dec_ == NULL) return NULL;
+    if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
+                  width, height, dec->alpha_plane_)) {
+      ALPHDelete(dec->alph_dec_);
+      dec->alph_dec_ = NULL;
+      return NULL;
+    }
+    // if we allowed use of alpha dithering, check whether it's needed at all
+    if (dec->alph_dec_->pre_processing_ != ALPHA_PREPROCESSED_LEVELS) {
+      dec->alpha_dithering_ = 0;  // disable dithering
+    } else {
+      num_rows = height;          // decode everything in one pass
+    }
+  }
+
+  if (!dec->is_alpha_decoded_) {
+    int ok = 0;
+    assert(dec->alph_dec_ != NULL);
+    ok = ALPHDecode(dec, row, num_rows);
+    if (ok && dec->alpha_dithering_ > 0) {
+      ok = WebPDequantizeLevels(dec->alpha_plane_, width, height,
+                                dec->alpha_dithering_);
+    }
+    if (!ok || dec->is_alpha_decoded_) {
+      ALPHDelete(dec->alph_dec_);
+      dec->alph_dec_ = NULL;
+    }
+    if (!ok) return NULL;  // Error.
+  }
+
+  // Return a pointer to the current decoded row.
+  return dec->alpha_plane_ + row * width;
+}
diff --git a/src/loaders/webp/dec/alphai.h b/src/loaders/webp/dec/alphai.h
new file mode 100644
index 00000000..058e883c
--- /dev/null
+++ b/src/loaders/webp/dec/alphai.h
@@ -0,0 +1,55 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Alpha decoder: internal header.
+//
+// Author: Urvang (urvang@google.com)
+
+#ifndef WEBP_DEC_ALPHAI_H_
+#define WEBP_DEC_ALPHAI_H_
+
+#include "./webpi.h"
+#include "../dsp/dsp.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8LDecoder;  // Defined in dec/vp8li.h.
+
+typedef struct ALPHDecoder ALPHDecoder;
+struct ALPHDecoder {
+  int width_;
+  int height_;
+  int method_;
+  WEBP_FILTER_TYPE filter_;
+  int pre_processing_;
+  struct VP8LDecoder* vp8l_dec_;
+  VP8Io io_;
+  int use_8b_decode;  // Although alpha channel requires only 1 byte per
+                      // pixel, sometimes VP8LDecoder may need to allocate
+                      // 4 bytes per pixel internally during decode.
+};
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+// Allocates a new alpha decoder instance.
+ALPHDecoder* ALPHNew(void);
+
+// Clears and deallocates an alpha decoder instance.
+void ALPHDelete(ALPHDecoder* const dec);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_ALPHAI_H_ */
diff --git a/src/loaders/webp/dec/buffer.cpp b/src/loaders/webp/dec/buffer.cpp
new file mode 100644
index 00000000..bdddcc93
--- /dev/null
+++ b/src/loaders/webp/dec/buffer.cpp
@@ -0,0 +1,238 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Everything about WebPDecBuffer
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+
+#include "./vp8i.h"
+#include "./webpi.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// WebPDecBuffer
+
+// Number of bytes per pixel for the different color-spaces.
+static const int kModeBpp[MODE_LAST] = {
+  3, 4, 3, 4, 4, 2, 2,
+  4, 4, 4, 2,    // pre-multiplied modes
+  1, 1 };
+
+// Check that webp_csp_mode is within the bounds of WEBP_CSP_MODE.
+// Convert to an integer to handle both the unsigned/signed enum cases
+// without the need for casting to remove type limit warnings.
+static int IsValidColorspace(int webp_csp_mode) {
+  return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
+}
+
+static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
+  int ok = 1;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
+  const int width = buffer->width;
+  const int height = buffer->height;
+  if (!IsValidColorspace(mode)) {
+    ok = 0;
+  } else if (!WebPIsRGBMode(mode)) {   // YUV checks
+    const WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int y_stride = abs(buf->y_stride);
+    const int u_stride = abs(buf->u_stride);
+    const int v_stride = abs(buf->v_stride);
+    const int a_stride = abs(buf->a_stride);
+    const uint64_t y_size = (uint64_t)y_stride * height;
+    const uint64_t u_size = (uint64_t)u_stride * ((height + 1) / 2);
+    const uint64_t v_size = (uint64_t)v_stride * ((height + 1) / 2);
+    const uint64_t a_size = (uint64_t)a_stride * height;
+    ok &= (y_size <= buf->y_size);
+    ok &= (u_size <= buf->u_size);
+    ok &= (v_size <= buf->v_size);
+    ok &= (y_stride >= width);
+    ok &= (u_stride >= (width + 1) / 2);
+    ok &= (v_stride >= (width + 1) / 2);
+    ok &= (buf->y != NULL);
+    ok &= (buf->u != NULL);
+    ok &= (buf->v != NULL);
+    if (mode == MODE_YUVA) {
+      ok &= (a_stride >= width);
+      ok &= (a_size <= buf->a_size);
+      ok &= (buf->a != NULL);
+    }
+  } else {    // RGB checks
+    const WebPRGBABuffer* const buf = &buffer->u.RGBA;
+    const int stride = abs(buf->stride);
+    const uint64_t size = (uint64_t)stride * height;
+    ok &= (size <= buf->size);
+    ok &= (stride >= width * kModeBpp[mode]);
+    ok &= (buf->rgba != NULL);
+  }
+  return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
+}
+
+static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
+  const int w = buffer->width;
+  const int h = buffer->height;
+  const WEBP_CSP_MODE mode = buffer->colorspace;
+
+  if (w <= 0 || h <= 0 || !IsValidColorspace(mode)) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+
+  if (!buffer->is_external_memory && buffer->private_memory == NULL) {
+    uint8_t* output;
+    int uv_stride = 0, a_stride = 0;
+    uint64_t uv_size = 0, a_size = 0, total_size;
+    // We need memory and it hasn't been allocated yet.
+    // => initialize output buffer, now that dimensions are known.
+    const int stride = w * kModeBpp[mode];
+    const uint64_t size = (uint64_t)stride * h;
+
+    if (!WebPIsRGBMode(mode)) {
+      uv_stride = (w + 1) / 2;
+      uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
+      if (mode == MODE_YUVA) {
+        a_stride = w;
+        a_size = (uint64_t)a_stride * h;
+      }
+    }
+    total_size = size + 2 * uv_size + a_size;
+
+    // Security/sanity checks
+    output = (uint8_t*)malloc(total_size * sizeof(*output));
+    if (output == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    buffer->private_memory = output;
+
+    if (!WebPIsRGBMode(mode)) {   // YUVA initialization
+      WebPYUVABuffer* const buf = &buffer->u.YUVA;
+      buf->y = output;
+      buf->y_stride = stride;
+      buf->y_size = (size_t)size;
+      buf->u = output + size;
+      buf->u_stride = uv_stride;
+      buf->u_size = (size_t)uv_size;
+      buf->v = output + size + uv_size;
+      buf->v_stride = uv_stride;
+      buf->v_size = (size_t)uv_size;
+      if (mode == MODE_YUVA) {
+        buf->a = output + size + 2 * uv_size;
+      }
+      buf->a_size = (size_t)a_size;
+      buf->a_stride = a_stride;
+    } else {  // RGBA initialization
+      WebPRGBABuffer* const buf = &buffer->u.RGBA;
+      buf->rgba = output;
+      buf->stride = stride;
+      buf->size = (size_t)size;
+    }
+  }
+  return CheckDecBuffer(buffer);
+}
+
+VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
+  if (buffer == NULL) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  if (WebPIsRGBMode(buffer->colorspace)) {
+    WebPRGBABuffer* const buf = &buffer->u.RGBA;
+    buf->rgba += (buffer->height - 1) * buf->stride;
+    buf->stride = -buf->stride;
+  } else {
+    WebPYUVABuffer* const buf = &buffer->u.YUVA;
+    const int H = buffer->height;
+    buf->y += (H - 1) * buf->y_stride;
+    buf->y_stride = -buf->y_stride;
+    buf->u += ((H - 1) >> 1) * buf->u_stride;
+    buf->u_stride = -buf->u_stride;
+    buf->v += ((H - 1) >> 1) * buf->v_stride;
+    buf->v_stride = -buf->v_stride;
+    if (buf->a != NULL) {
+      buf->a += (H - 1) * buf->a_stride;
+      buf->a_stride = -buf->a_stride;
+    }
+  }
+  return VP8_STATUS_OK;
+}
+
+VP8StatusCode WebPAllocateDecBuffer(int w, int h,
+                                    const WebPDecoderOptions* const options,
+                                    WebPDecBuffer* const out) {
+  VP8StatusCode status;
+  if (out == NULL || w <= 0 || h <= 0) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  if (options != NULL) {    // First, apply options if there is any.
+    if (options->use_cropping) {
+      const int cw = options->crop_width;
+      const int ch = options->crop_height;
+      const int x = options->crop_left & ~1;
+      const int y = options->crop_top & ~1;
+      if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) {
+        return VP8_STATUS_INVALID_PARAM;   // out of frame boundary.
+      }
+      w = cw;
+      h = ch;
+    }
+    if (options->use_scaling) {
+      if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+        return VP8_STATUS_INVALID_PARAM;
+      }
+      w = options->scaled_width;
+      h = options->scaled_height;
+    }
+  }
+  out->width = w;
+  out->height = h;
+
+  // Then, allocate buffer for real.
+  status = AllocateBuffer(out);
+  if (status != VP8_STATUS_OK) return status;
+
+  // Use the stride trick if vertical flip is needed.
+  if (options != NULL && options->flip) {
+    status = WebPFlipBuffer(out);
+  }
+  return status;
+}
+
+//------------------------------------------------------------------------------
+// constructors / destructors
+
+int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
+    return 0;  // version mismatch
+  }
+  if (buffer == NULL) return 0;
+  memset(buffer, 0, sizeof(*buffer));
+  return 1;
+}
+
+void WebPFreeDecBuffer(WebPDecBuffer* buffer) {
+  if (buffer != NULL) {
+    if (!buffer->is_external_memory) {
+      free(buffer->private_memory);
+    }
+    buffer->private_memory = NULL;
+  }
+}
+
+void WebPCopyDecBuffer(const WebPDecBuffer* const src,
+                       WebPDecBuffer* const dst) {
+  if (src != NULL && dst != NULL) {
+    *dst = *src;
+    if (src->private_memory != NULL) {
+      dst->is_external_memory = 1;   // dst buffer doesn't own the memory.
+      dst->private_memory = NULL;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
diff --git a/src/loaders/webp/dec/common.h b/src/loaders/webp/dec/common.h
new file mode 100644
index 00000000..6961e224
--- /dev/null
+++ b/src/loaders/webp/dec/common.h
@@ -0,0 +1,54 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Definitions and macros common to encoding and decoding
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DEC_COMMON_H_
+#define WEBP_DEC_COMMON_H_
+
+// intra prediction modes
+enum { B_DC_PRED = 0,   // 4x4 modes
+       B_TM_PRED = 1,
+       B_VE_PRED = 2,
+       B_HE_PRED = 3,
+       B_RD_PRED = 4,
+       B_VR_PRED = 5,
+       B_LD_PRED = 6,
+       B_VL_PRED = 7,
+       B_HD_PRED = 8,
+       B_HU_PRED = 9,
+       NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED,  // = 10
+
+       // Luma16 or UV modes
+       DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
+       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
+       B_PRED = NUM_BMODES,   // refined I4x4 mode
+       NUM_PRED_MODES = 4,
+
+       // special modes
+       B_DC_PRED_NOTOP = 4,
+       B_DC_PRED_NOLEFT = 5,
+       B_DC_PRED_NOTOPLEFT = 6,
+       NUM_B_DC_MODES = 7 };
+
+enum { MB_FEATURE_TREE_PROBS = 3,
+       NUM_MB_SEGMENTS = 4,
+       NUM_REF_LF_DELTAS = 4,
+       NUM_MODE_LF_DELTAS = 4,    // I4x4, ZERO, *, SPLIT
+       MAX_NUM_PARTITIONS = 8,
+       // Probabilities
+       NUM_TYPES = 4,   // 0: i16-AC,  1: i16-DC,  2:chroma-AC,  3:i4-AC
+       NUM_BANDS = 8,
+       NUM_CTX = 3,
+       NUM_PROBAS = 11
+     };
+
+#endif    // WEBP_DEC_COMMON_H_
diff --git a/src/loaders/webp/dec/decode_vp8.h b/src/loaders/webp/dec/decode_vp8.h
new file mode 100644
index 00000000..236342fa
--- /dev/null
+++ b/src/loaders/webp/dec/decode_vp8.h
@@ -0,0 +1,177 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Low-level API for VP8 decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_WEBP_DECODE_VP8_H_
+#define WEBP_WEBP_DECODE_VP8_H_
+
+#include "../webp/decode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Lower-level API
+//
+// These functions provide fine-grained control of the decoding process.
+// The call flow should resemble:
+//
+//   VP8Io io;
+//   VP8InitIo(&io);
+//   io.data = data;
+//   io.data_size = size;
+//   /* customize io's functions (setup()/put()/teardown()) if needed. */
+//
+//   VP8Decoder* dec = VP8New();
+//   bool ok = VP8Decode(dec);
+//   if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
+//   VP8Delete(dec);
+//   return ok;
+
+// Input / Output
+typedef struct VP8Io VP8Io;
+typedef int (*VP8IoPutHook)(const VP8Io* io);
+typedef int (*VP8IoSetupHook)(VP8Io* io);
+typedef void (*VP8IoTeardownHook)(const VP8Io* io);
+
+struct VP8Io {
+  // set by VP8GetHeaders()
+  int width, height;         // picture dimensions, in pixels (invariable).
+                             // These are the original, uncropped dimensions.
+                             // The actual area passed to put() is stored
+                             // in mb_w / mb_h fields.
+
+  // set before calling put()
+  int mb_y;                  // position of the current rows (in pixels)
+  int mb_w;                  // number of columns in the sample
+  int mb_h;                  // number of rows in the sample
+  const uint8_t* y, *u, *v;  // rows to copy (in yuv420 format)
+  int y_stride;              // row stride for luma
+  int uv_stride;             // row stride for chroma
+
+  void* opaque;              // user data
+
+  // called when fresh samples are available. Currently, samples are in
+  // YUV420 format, and can be up to width x 24 in size (depending on the
+  // in-loop filtering level, e.g.). Should return false in case of error
+  // or abort request. The actual size of the area to update is mb_w x mb_h
+  // in size, taking cropping into account.
+  VP8IoPutHook put;
+
+  // called just before starting to decode the blocks.
+  // Must return false in case of setup error, true otherwise. If false is
+  // returned, teardown() will NOT be called. But if the setup succeeded
+  // and true is returned, then teardown() will always be called afterward.
+  VP8IoSetupHook setup;
+
+  // Called just after block decoding is finished (or when an error occurred
+  // during put()). Is NOT called if setup() failed.
+  VP8IoTeardownHook teardown;
+
+  // this is a recommendation for the user-side yuv->rgb converter. This flag
+  // is set when calling setup() hook and can be overwritten by it. It then
+  // can be taken into consideration during the put() method.
+  int fancy_upsampling;
+
+  // Input buffer.
+  size_t data_size;
+  const uint8_t* data;
+
+  // If true, in-loop filtering will not be performed even if present in the
+  // bitstream. Switching off filtering may speed up decoding at the expense
+  // of more visible blocking. Note that output will also be non-compliant
+  // with the VP8 specifications.
+  int bypass_filtering;
+
+  // Cropping parameters.
+  int use_cropping;
+  int crop_left, crop_right, crop_top, crop_bottom;
+
+  // Scaling parameters.
+  int use_scaling;
+  int scaled_width, scaled_height;
+
+  // If non NULL, pointer to the alpha data (if present) corresponding to the
+  // start of the current row (That is: it is pre-offset by mb_y and takes
+  // cropping into account).
+  const uint8_t* a;
+};
+
+// Internal, version-checked, entry point
+int VP8InitIoInternal(VP8Io* const, int);
+
+
+// Main decoding object. This is an opaque structure.
+typedef struct VP8Decoder VP8Decoder;
+
+// Create a new decoder object.
+VP8Decoder* VP8New(void);
+
+// Must be called to make sure 'io' is initialized properly.
+// Returns false in case of version mismatch. Upon such failure, no other
+// decoding function should be called (VP8Decode, VP8GetHeaders, ...)
+static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
+  return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
+}
+
+// Decode the VP8 frame header. Returns true if ok.
+// Note: 'io->data' must be pointing to the start of the VP8 frame header.
+int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
+
+// Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
+// Returns false in case of error.
+int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
+
+// Return current status of the decoder:
+VP8StatusCode VP8Status(VP8Decoder* const dec);
+
+// return readable string corresponding to the last status.
+const char* VP8StatusMessage(VP8Decoder* const dec);
+
+// Resets the decoder in its initial state, reclaiming memory.
+// Not a mandatory call between calls to VP8Decode().
+void VP8Clear(VP8Decoder* const dec);
+
+// Destroy the decoder object.
+void VP8Delete(VP8Decoder* const dec);
+
+//------------------------------------------------------------------------------
+// Miscellaneous VP8/VP8L bitstream probing functions.
+
+// Returns true if the next 3 bytes in data contain the VP8 signature.
+WEBP_EXTERN(int) VP8CheckSignature(const uint8_t* const data, size_t data_size);
+
+// Validates the VP8 data-header and retrieves basic header information viz
+// width and height. Returns 0 in case of formatting error. *width/*height
+// can be passed NULL.
+WEBP_EXTERN(int) VP8GetInfo(
+    const uint8_t* data,
+    size_t data_size,    // data available so far
+    size_t chunk_size,   // total data size expected in the chunk
+    int* const width, int* const height);
+
+// Returns true if the next byte(s) in data is a VP8L signature.
+WEBP_EXTERN(int) VP8LCheckSignature(const uint8_t* const data, size_t size);
+
+// Validates the VP8L data-header and retrieves basic header information viz
+// width, height and alpha. Returns 0 in case of formatting error.
+// width/height/has_alpha can be passed NULL.
+WEBP_EXTERN(int) VP8LGetInfo(
+    const uint8_t* data, size_t data_size,  // data available so far
+    int* const width, int* const height, int* const has_alpha);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_WEBP_DECODE_VP8_H_ */
diff --git a/src/loaders/webp/dec/frame.cpp b/src/loaders/webp/dec/frame.cpp
new file mode 100644
index 00000000..1f388a38
--- /dev/null
+++ b/src/loaders/webp/dec/frame.cpp
@@ -0,0 +1,713 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Frame-reconstruction function. Memory allocation.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+#include "./vp8i.h"
+#include "../utils/utils.h"
+
+#define ALIGN_CST (32 - 1)
+#define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
+
+//------------------------------------------------------------------------------
+// Main reconstruction function.
+
+static const int kScan[16] = {
+  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
+  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
+  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
+  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
+};
+
+static int CheckMode(int mb_x, int mb_y, int mode) {
+  if (mode == B_DC_PRED) {
+    if (mb_x == 0) {
+      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
+    } else {
+      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
+    }
+  }
+  return mode;
+}
+
+static void Copy32b(uint8_t* const dst, const uint8_t* const src) {
+  memcpy(dst, src, 4);
+}
+
+static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
+                                    uint8_t* const dst) {
+  switch (bits >> 30) {
+    case 3:
+      VP8Transform(src, dst, 0);
+      break;
+    case 2:
+      VP8TransformAC3(src, dst);
+      break;
+    case 1:
+      VP8TransformDC(src, dst);
+      break;
+    default:
+      break;
+  }
+}
+
+static void DoUVTransform(uint32_t bits, const int16_t* const src,
+                          uint8_t* const dst) {
+  if (bits & 0xff) {    // any non-zero coeff at all?
+    if (bits & 0xaa) {  // any non-zero AC coefficient?
+      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
+    } else {
+      VP8TransformDCUV(src, dst);
+    }
+  }
+}
+
+static void ReconstructRow(const VP8Decoder* const dec,
+                           const VP8ThreadContext* ctx) {
+  int j;
+  int mb_x;
+  const int mb_y = ctx->mb_y_;
+  const int cache_id = ctx->id_;
+  uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
+  uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
+  uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
+
+  // Initialize left-most block.
+  for (j = 0; j < 16; ++j) {
+    y_dst[j * BPS - 1] = 129;
+  }
+  for (j = 0; j < 8; ++j) {
+    u_dst[j * BPS - 1] = 129;
+    v_dst[j * BPS - 1] = 129;
+  }
+
+  // Init top-left sample on left column too.
+  if (mb_y > 0) {
+    y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
+  } else {
+    // we only need to do this init once at block (0,0).
+    // Afterward, it remains valid for the whole topmost row.
+    memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
+    memset(u_dst - BPS - 1, 127, 8 + 1);
+    memset(v_dst - BPS - 1, 127, 8 + 1);
+  }
+
+  // Reconstruct one row.
+  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+    const VP8MBData* const block = ctx->mb_data_ + mb_x;
+
+    // Rotate in the left samples from previously decoded block. We move four
+    // pixels at a time for alignment reason, and because of in-loop filter.
+    if (mb_x > 0) {
+      for (j = -1; j < 16; ++j) {
+        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
+      }
+      for (j = -1; j < 8; ++j) {
+        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
+        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
+      }
+    }
+    {
+      // bring top samples into the cache
+      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
+      const int16_t* const coeffs = block->coeffs_;
+      uint32_t bits = block->non_zero_y_;
+      int n;
+
+      if (mb_y > 0) {
+        memcpy(y_dst - BPS, top_yuv[0].y, 16);
+        memcpy(u_dst - BPS, top_yuv[0].u, 8);
+        memcpy(v_dst - BPS, top_yuv[0].v, 8);
+      }
+
+      // predict and add residuals
+      if (block->is_i4x4_) {   // 4x4
+        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
+
+        if (mb_y > 0) {
+          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
+            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
+          } else {
+            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
+          }
+        }
+        // replicate the top-right pixels below
+        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
+
+        // predict and add residuals for all 4x4 blocks in turn.
+        for (n = 0; n < 16; ++n, bits <<= 2) {
+          uint8_t* const dst = y_dst + kScan[n];
+          VP8PredLuma4[block->imodes_[n]](dst);
+          DoTransform(bits, coeffs + n * 16, dst);
+        }
+      } else {    // 16x16
+        const int pred_func = CheckMode(mb_x, mb_y, block->imodes_[0]);
+        VP8PredLuma16[pred_func](y_dst);
+        if (bits != 0) {
+          for (n = 0; n < 16; ++n, bits <<= 2) {
+            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
+          }
+        }
+      }
+      {
+        // Chroma
+        const uint32_t bits_uv = block->non_zero_uv_;
+        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
+        VP8PredChroma8[pred_func](u_dst);
+        VP8PredChroma8[pred_func](v_dst);
+        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
+        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
+      }
+
+      // stash away top samples for next block
+      if (mb_y < dec->mb_h_ - 1) {
+        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
+        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
+        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
+      }
+    }
+    // Transfer reconstructed samples from yuv_b_ cache to final destination.
+    {
+      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
+      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
+      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
+      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
+      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
+      for (j = 0; j < 16; ++j) {
+        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
+      }
+      for (j = 0; j < 8; ++j) {
+        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
+        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Filtering
+
+// kFilterExtraRows[] = How many extra lines are needed on the MB boundary
+// for caching, given a filtering level.
+// Simple filter:  up to 2 luma samples are read and 1 is written.
+// Complex filter: up to 4 luma samples are read and 3 are written. Same for
+//                 U/V, so it's 8 samples total (because of the 2x upsampling).
+static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
+
+static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
+  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int cache_id = ctx->id_;
+  const int y_bps = dec->cache_y_stride_;
+  const VP8FInfo* const f_info = ctx->f_info_ + mb_x;
+  uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
+  const int ilevel = f_info->f_ilevel_;
+  const int limit = f_info->f_limit_;
+  if (limit == 0) {
+    return;
+  }
+  assert(limit >= 3);
+  if (dec->filter_type_ == 1) {   // simple
+    if (mb_x > 0) {
+      VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
+    }
+    if (f_info->f_inner_) {
+      VP8SimpleHFilter16i(y_dst, y_bps, limit);
+    }
+    if (mb_y > 0) {
+      VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
+    }
+    if (f_info->f_inner_) {
+      VP8SimpleVFilter16i(y_dst, y_bps, limit);
+    }
+  } else {    // complex
+    const int uv_bps = dec->cache_uv_stride_;
+    uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
+    const int hev_thresh = f_info->hev_thresh_;
+    if (mb_x > 0) {
+      VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
+      VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
+    }
+    if (f_info->f_inner_) {
+      VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
+      VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
+    }
+    if (mb_y > 0) {
+      VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
+      VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
+    }
+    if (f_info->f_inner_) {
+      VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
+      VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
+    }
+  }
+}
+
+// Filter the decoded macroblock row (if needed)
+static void FilterRow(const VP8Decoder* const dec) {
+  int mb_x;
+  const int mb_y = dec->thread_ctx_.mb_y_;
+  assert(dec->thread_ctx_.filter_row_);
+  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
+    DoFilter(dec, mb_x, mb_y);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Precompute the filtering strength for each segment and each i4x4/i16x16 mode.
+
+static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
+  if (dec->filter_type_ > 0) {
+    int s;
+    const VP8FilterHeader* const hdr = &dec->filter_hdr_;
+    for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+      int i4x4;
+      // First, compute the initial level
+      int base_level;
+      if (dec->segment_hdr_.use_segment_) {
+        base_level = dec->segment_hdr_.filter_strength_[s];
+        if (!dec->segment_hdr_.absolute_delta_) {
+          base_level += hdr->level_;
+        }
+      } else {
+        base_level = hdr->level_;
+      }
+      for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
+        VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
+        int level = base_level;
+        if (hdr->use_lf_delta_) {
+          level += hdr->ref_lf_delta_[0];
+          if (i4x4) {
+            level += hdr->mode_lf_delta_[0];
+          }
+        }
+        level = (level < 0) ? 0 : (level > 63) ? 63 : level;
+        if (level > 0) {
+          int ilevel = level;
+          if (hdr->sharpness_ > 0) {
+            if (hdr->sharpness_ > 4) {
+              ilevel >>= 2;
+            } else {
+              ilevel >>= 1;
+            }
+            if (ilevel > 9 - hdr->sharpness_) {
+              ilevel = 9 - hdr->sharpness_;
+            }
+          }
+          if (ilevel < 1) ilevel = 1;
+          info->f_ilevel_ = ilevel;
+          info->f_limit_ = 2 * level + ilevel;
+          info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
+        } else {
+          info->f_limit_ = 0;  // no filtering
+        }
+        info->f_inner_ = i4x4;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Dithering
+
+#define DITHER_AMP_TAB_SIZE 12
+static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
+  // roughly, it's dqm->uv_mat_[1]
+  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
+};
+
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec) {
+  assert(dec != NULL);
+  if (options != NULL) {
+    const int d = options->dithering_strength;
+    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
+    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
+    if (f > 0) {
+      int s;
+      int all_amp = 0;
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        VP8QuantMatrix* const dqm = &dec->dqm_[s];
+        if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
+          // TODO(skal): should we specially dither more for uv_quant_ < 0?
+          const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
+          dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
+        }
+        all_amp |= dqm->dither_;
+      }
+      if (all_amp != 0) {
+        VP8InitRandom(&dec->dithering_rg_, 1.0f);
+        dec->dither_ = 1;
+      }
+    }
+    // potentially allow alpha dithering
+    dec->alpha_dithering_ = options->alpha_dithering_strength;
+    if (dec->alpha_dithering_ > 100) {
+      dec->alpha_dithering_ = 100;
+    } else if (dec->alpha_dithering_ < 0) {
+      dec->alpha_dithering_ = 0;
+    }
+  }
+}
+
+// minimal amp that will provide a non-zero dithering effect
+#define MIN_DITHER_AMP 4
+#define DITHER_DESCALE 4
+#define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
+#define DITHER_AMP_BITS 8
+#define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
+
+static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
+  int i, j;
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i) {
+      // TODO: could be made faster with SSE2
+      const int bits =
+          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
+      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
+      const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
+      const int v = (int)dst[i] + delta;
+      dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;
+    }
+    dst += bps;
+  }
+}
+
+static void DitherRow(VP8Decoder* const dec) {
+  int mb_x;
+  assert(dec->dither_);
+  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
+    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+    const VP8MBData* const data = ctx->mb_data_ + mb_x;
+    const int cache_id = ctx->id_;
+    const int uv_bps = dec->cache_uv_stride_;
+    if (data->dither_ >= MIN_DITHER_AMP) {
+      uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
+      uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
+      Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);
+      Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// This function is called after a row of macroblocks is finished decoding.
+// It also takes into account the following restrictions:
+//  * In case of in-loop filtering, we must hold off sending some of the bottom
+//    pixels as they are yet unfiltered. They will be when the next macroblock
+//    row is decoded. Meanwhile, we must preserve them by rotating them in the
+//    cache area. This doesn't hold for the very bottom row of the uncropped
+//    picture of course.
+//  * we must clip the remaining pixels against the cropping area. The VP8Io
+//    struct must have the following fields set correctly before calling put():
+
+#define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB
+
+// Finalize and transmit a complete row. Return false in case of user-abort.
+static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int cache_id = ctx->id_;
+  const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
+  const int ysize = extra_y_rows * dec->cache_y_stride_;
+  const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
+  const int y_offset = cache_id * 16 * dec->cache_y_stride_;
+  const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
+  uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
+  uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
+  uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
+  const int mb_y = ctx->mb_y_;
+  const int is_first_row = (mb_y == 0);
+  const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
+
+  if (ctx->filter_row_) {
+    FilterRow(dec);
+  }
+
+  if (dec->dither_) {
+    DitherRow(dec);
+  }
+
+  if (io->put != NULL) {
+    int y_start = MACROBLOCK_VPOS(mb_y);
+    int y_end = MACROBLOCK_VPOS(mb_y + 1);
+    if (!is_first_row) {
+      y_start -= extra_y_rows;
+      io->y = ydst;
+      io->u = udst;
+      io->v = vdst;
+    } else {
+      io->y = dec->cache_y_ + y_offset;
+      io->u = dec->cache_u_ + uv_offset;
+      io->v = dec->cache_v_ + uv_offset;
+    }
+
+    if (!is_last_row) {
+      y_end -= extra_y_rows;
+    }
+    if (y_end > io->crop_bottom) {
+      y_end = io->crop_bottom;    // make sure we don't overflow on last row.
+    }
+    io->a = NULL;
+    if (dec->alpha_data_ != NULL && y_start < y_end) {
+      // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
+      // good idea.
+      io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
+      if (io->a == NULL) {
+        return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                           "Could not decode alpha data.");
+      }
+    }
+    if (y_start < io->crop_top) {
+      const int delta_y = io->crop_top - y_start;
+      y_start = io->crop_top;
+      assert(!(delta_y & 1));
+      io->y += dec->cache_y_stride_ * delta_y;
+      io->u += dec->cache_uv_stride_ * (delta_y >> 1);
+      io->v += dec->cache_uv_stride_ * (delta_y >> 1);
+      if (io->a != NULL) {
+        io->a += io->width * delta_y;
+      }
+    }
+    if (y_start < y_end) {
+      io->y += io->crop_left;
+      io->u += io->crop_left >> 1;
+      io->v += io->crop_left >> 1;
+      if (io->a != NULL) {
+        io->a += io->crop_left;
+      }
+      io->mb_y = y_start - io->crop_top;
+      io->mb_w = io->crop_right - io->crop_left;
+      io->mb_h = y_end - y_start;
+      ok = io->put(io);
+    }
+  }
+  // rotate top samples if needed
+  if (cache_id + 1 == dec->num_caches_) {
+    if (!is_last_row) {
+      memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
+      memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
+      memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
+    }
+  }
+
+  return ok;
+}
+
+#undef MACROBLOCK_VPOS
+
+//------------------------------------------------------------------------------
+
+int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int filter_row =
+      (dec->filter_type_ > 0) &&
+      (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
+  // ctx->id_ and ctx->f_info_ are already set
+  ctx->mb_y_ = dec->mb_y_;
+  ctx->filter_row_ = filter_row;
+  ReconstructRow(dec, ctx);
+  ok = FinishRow(dec, io);
+
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+// Finish setting up the decoding parameter once user's setup() is called.
+
+VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
+  // Call setup() first. This may trigger additional decoding features on 'io'.
+  // Note: Afterward, we must call teardown() no matter what.
+  if (io->setup != NULL && !io->setup(io)) {
+    VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
+    return dec->status_;
+  }
+
+  // Disable filtering per user request
+  if (io->bypass_filtering) {
+    dec->filter_type_ = 0;
+  }
+  // TODO(skal): filter type / strength / sharpness forcing
+
+  // Define the area where we can skip in-loop filtering, in case of cropping.
+  //
+  // 'Simple' filter reads two luma samples outside of the macroblock
+  // and filters one. It doesn't filter the chroma samples. Hence, we can
+  // avoid doing the in-loop filtering before crop_top/crop_left position.
+  // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
+  // Means: there's a dependency chain that goes all the way up to the
+  // top-left corner of the picture (MB #0). We must filter all the previous
+  // macroblocks.
+  // TODO(skal): add an 'approximate_decoding' option, that won't produce
+  // a 1:1 bit-exactness for complex filtering?
+  {
+    const int extra_pixels = kFilterExtraRows[dec->filter_type_];
+    if (dec->filter_type_ == 2) {
+      // For complex filter, we need to preserve the dependency chain.
+      dec->tl_mb_x_ = 0;
+      dec->tl_mb_y_ = 0;
+    } else {
+      // For simple filter, we can filter only the cropped region.
+      // We include 'extra_pixels' on the other side of the boundary, since
+      // vertical or horizontal filtering of the previous macroblock can
+      // modify some abutting pixels.
+      dec->tl_mb_x_ = (io->crop_left - extra_pixels) >> 4;
+      dec->tl_mb_y_ = (io->crop_top - extra_pixels) >> 4;
+      if (dec->tl_mb_x_ < 0) dec->tl_mb_x_ = 0;
+      if (dec->tl_mb_y_ < 0) dec->tl_mb_y_ = 0;
+    }
+    // We need some 'extra' pixels on the right/bottom.
+    dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4;
+    dec->br_mb_x_ = (io->crop_right + 15 + extra_pixels) >> 4;
+    if (dec->br_mb_x_ > dec->mb_w_) {
+      dec->br_mb_x_ = dec->mb_w_;
+    }
+    if (dec->br_mb_y_ > dec->mb_h_) {
+      dec->br_mb_y_ = dec->mb_h_;
+    }
+  }
+  PrecomputeFilterStrengths(dec);
+  return VP8_STATUS_OK;
+}
+
+int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 1;
+  if (io->teardown != NULL) {
+    io->teardown(io);
+  }
+  return ok;
+}
+
+#define ST_CACHE_LINES 1   // 1 cache row only for single-threaded case
+
+// Initialize multi/single-thread worker
+static int InitThreadContext(VP8Decoder* const dec) {
+  dec->cache_id_ = 0;
+  dec->num_caches_ = ST_CACHE_LINES;
+  return 1;
+}
+
+#undef ST_CACHE_LINES
+
+//------------------------------------------------------------------------------
+// Memory setup
+
+static int AllocateMemory(VP8Decoder* const dec) {
+  const int num_caches = dec->num_caches_;
+  const int mb_w = dec->mb_w_;
+  // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
+  const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
+  const size_t top_size = sizeof(VP8TopSamples) * mb_w;
+  const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
+  const size_t f_info_size = (dec->filter_type_ > 0) ? mb_w * sizeof(VP8FInfo) : 0;
+  const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
+  const size_t mb_data_size = mb_w * sizeof(*dec->mb_data_);
+  const size_t cache_height = (16 * num_caches
+                            + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
+  const size_t cache_size = top_size * cache_height;
+  // alpha_size is the only one that scales as width x height.
+  const uint64_t alpha_size = (dec->alpha_data_ != NULL) ?
+      (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
+  const uint64_t needed = (uint64_t)intra_pred_mode_size
+                        + top_size + mb_info_size + f_info_size
+                        + yuv_size + mb_data_size
+                        + cache_size + alpha_size + ALIGN_CST;
+  uint8_t* mem;
+
+  if (needed != (size_t)needed) return 0;  // check for overflow
+  if (needed > dec->mem_size_) {
+    free(dec->mem_);
+    dec->mem_size_ = 0;
+    dec->mem_ = malloc(needed * sizeof(uint8_t));
+    if (dec->mem_ == NULL) {
+      return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
+                         "no memory during frame initialization.");
+    }
+    // down-cast is ok, thanks to WebPSafeAlloc() above.
+    dec->mem_size_ = (size_t)needed;
+  }
+
+  mem = (uint8_t*)dec->mem_;
+  dec->intra_t_ = (uint8_t*)mem;
+  mem += intra_pred_mode_size;
+
+  dec->yuv_t_ = (VP8TopSamples*)mem;
+  mem += top_size;
+
+  dec->mb_info_ = ((VP8MB*)mem) + 1;
+  mem += mb_info_size;
+
+  dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL;
+  mem += f_info_size;
+  dec->thread_ctx_.id_ = 0;
+  dec->thread_ctx_.f_info_ = dec->f_info_;
+
+  mem = (uint8_t*)DO_ALIGN(mem);
+  assert((yuv_size & ALIGN_CST) == 0);
+  dec->yuv_b_ = (uint8_t*)mem;
+  mem += yuv_size;
+
+  dec->mb_data_ = (VP8MBData*)mem;
+  dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
+  mem += mb_data_size;
+
+  dec->cache_y_stride_ = 16 * mb_w;
+  dec->cache_uv_stride_ = 8 * mb_w;
+  {
+    const int extra_rows = kFilterExtraRows[dec->filter_type_];
+    const int extra_y = extra_rows * dec->cache_y_stride_;
+    const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
+    dec->cache_y_ = ((uint8_t*)mem) + extra_y;
+    dec->cache_u_ = dec->cache_y_
+                  + 16 * num_caches * dec->cache_y_stride_ + extra_uv;
+    dec->cache_v_ = dec->cache_u_
+                  + 8 * num_caches * dec->cache_uv_stride_ + extra_uv;
+    dec->cache_id_ = 0;
+  }
+  mem += cache_size;
+
+  // alpha plane
+  dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL;
+  mem += alpha_size;
+  assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
+
+  // note: left/top-info is initialized once for all.
+  memset(dec->mb_info_ - 1, 0, mb_info_size);
+  VP8InitScanline(dec);   // initialize left too.
+
+  // initialize top
+  memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
+
+  return 1;
+}
+
+static void InitIo(VP8Decoder* const dec, VP8Io* io) {
+  // prepare 'io'
+  io->mb_y = 0;
+  io->y = dec->cache_y_;
+  io->u = dec->cache_u_;
+  io->v = dec->cache_v_;
+  io->y_stride = dec->cache_y_stride_;
+  io->uv_stride = dec->cache_uv_stride_;
+  io->a = NULL;
+}
+
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) {
+  if (!InitThreadContext(dec)) return 0;  // call first. Sets dec->num_caches_.
+  if (!AllocateMemory(dec)) return 0;
+  InitIo(dec, io);
+  VP8DspInit();  // Init critical function pointers and look-up tables.
+  return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/loaders/webp/dec/io.cpp b/src/loaders/webp/dec/io.cpp
new file mode 100644
index 00000000..d0d05496
--- /dev/null
+++ b/src/loaders/webp/dec/io.cpp
@@ -0,0 +1,392 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// functions for sample output.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "../dec/vp8i.h"
+#include "./webpi.h"
+#include "../dsp/dsp.h"
+#include "../dsp/yuv.h"
+#include "../utils/utils.h"
+
+
+// Point-sampling U/V sampler.
+static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
+  WebPDecBuffer* const output = p->output;
+  WebPRGBABuffer* const buf = &output->u.RGBA;
+  uint8_t* const dst = buf->rgba + io->mb_y * buf->stride;
+  WebPSamplerProcessPlane(io->y, io->y_stride,
+                          io->u, io->v, io->uv_stride,
+                          dst, buf->stride, io->mb_w, io->mb_h,
+                          WebPSamplers[output->colorspace]);
+  return io->mb_h;
+}
+
+//------------------------------------------------------------------------------
+// Fancy upsampling
+
+#ifdef FANCY_UPSAMPLING
+static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
+  int num_lines_out = io->mb_h;   // a priori guess
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
+  const uint8_t* cur_y = io->y;
+  const uint8_t* cur_u = io->u;
+  const uint8_t* cur_v = io->v;
+  const uint8_t* top_u = p->tmp_u;
+  const uint8_t* top_v = p->tmp_v;
+  int y = io->mb_y;
+  const int y_end = io->mb_y + io->mb_h;
+  const int mb_w = io->mb_w;
+  const int uv_w = (mb_w + 1) / 2;
+
+  if (y == 0) {
+    // First line is special cased. We mirror the u/v samples at boundary.
+    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, mb_w);
+  } else {
+    // We can finish the left-over line from previous call.
+    upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
+             dst - buf->stride, dst, mb_w);
+    ++num_lines_out;
+  }
+  // Loop over each output pairs of row.
+  for (; y + 2 < y_end; y += 2) {
+    top_u = cur_u;
+    top_v = cur_v;
+    cur_u += io->uv_stride;
+    cur_v += io->uv_stride;
+    dst += 2 * buf->stride;
+    cur_y += 2 * io->y_stride;
+    upsample(cur_y - io->y_stride, cur_y,
+             top_u, top_v, cur_u, cur_v,
+             dst - buf->stride, dst, mb_w);
+  }
+  // move to last row
+  cur_y += io->y_stride;
+  if (io->crop_top + y_end < io->crop_bottom) {
+    // Save the unfinished samples for next call (as we're not done yet).
+    memcpy(p->tmp_y, cur_y, mb_w * sizeof(*p->tmp_y));
+    memcpy(p->tmp_u, cur_u, uv_w * sizeof(*p->tmp_u));
+    memcpy(p->tmp_v, cur_v, uv_w * sizeof(*p->tmp_v));
+    // The fancy upsampler leaves a row unfinished behind
+    // (except for the very last row)
+    num_lines_out--;
+  } else {
+    // Process the very last row of even-sized picture
+    if (!(y_end & 1)) {
+      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
+               dst + buf->stride, NULL, mb_w);
+    }
+  }
+  return num_lines_out;
+}
+
+#endif    /* FANCY_UPSAMPLING */
+
+//------------------------------------------------------------------------------
+
+
+static int GetAlphaSourceRow(const VP8Io* const io,
+                             const uint8_t** alpha, int* const num_rows) {
+  int start_y = io->mb_y;
+  *num_rows = io->mb_h;
+
+  // Compensate for the 1-line delay of the fancy upscaler.
+  // This is similar to EmitFancyRGB().
+  if (io->fancy_upsampling) {
+    if (start_y == 0) {
+      // We don't process the last row yet. It'll be done during the next call.
+      --*num_rows;
+    } else {
+      --start_y;
+      // Fortunately, *alpha data is persistent, so we can go back
+      // one row and finish alpha blending, now that the fancy upscaler
+      // completed the YUV->RGB interpolation.
+      *alpha -= io->width;
+    }
+    if (io->crop_top + io->mb_y + io->mb_h == io->crop_bottom) {
+      // If it's the very last call, we process all the remaining rows!
+      *num_rows = io->crop_bottom - io->crop_top - start_y;
+    }
+  }
+  return start_y;
+}
+
+static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const uint8_t* alpha = io->a;
+  if (alpha != NULL) {
+    const int mb_w = io->mb_w;
+    const WEBP_CSP_MODE colorspace = p->output->colorspace;
+    const int alpha_first =
+        (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+    int num_rows;
+    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
+    uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
+    const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
+                                            num_rows, dst, buf->stride);
+
+    // has_alpha is true if there's non-trivial alpha to premultiply with.
+    if (has_alpha && WebPIsPremultipliedMode(colorspace)) {
+      WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                             mb_w, num_rows, buf->stride);
+    }
+  }
+  return 0;
+}
+
+
+//------------------------------------------------------------------------------
+// RGBA rescaling
+
+static int ExportRGB(WebPDecParams* const p, int y_pos) {
+  const WebPYUV444Converter convert =
+      WebPYUV444Converters[p->output->colorspace];
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  int num_lines_out = 0;
+  // For RGB rescaling, because of the YUV420, current scan position
+  // U/V can be +1/-1 line from the Y one.  Hence the double test.
+  while (WebPRescalerHasPendingOutput(&p->scaler_y) &&
+         WebPRescalerHasPendingOutput(&p->scaler_u)) {
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
+    WebPRescalerExportRow(&p->scaler_y, 0);
+    WebPRescalerExportRow(&p->scaler_u, 0);
+    WebPRescalerExportRow(&p->scaler_v, 0);
+    convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
+            dst, p->scaler_y.dst_width);
+    dst += buf->stride;
+    ++num_lines_out;
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
+  const int mb_h = io->mb_h;
+  const int uv_mb_h = (mb_h + 1) >> 1;
+  int j = 0, uv_j = 0;
+  int num_lines_out = 0;
+  while (j < mb_h) {
+    const int y_lines_in =
+        WebPRescalerImport(&p->scaler_y, mb_h - j,
+                           io->y + j * io->y_stride, io->y_stride);
+    const int u_lines_in =
+        WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
+                           io->u + uv_j * io->uv_stride, io->uv_stride);
+    const int v_lines_in =
+        WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
+                           io->v + uv_j * io->uv_stride, io->uv_stride);
+    (void)v_lines_in;   // remove a gcc warning
+    assert(u_lines_in == v_lines_in);
+    j += y_lines_in;
+    uv_j += u_lines_in;
+    num_lines_out += ExportRGB(p, num_lines_out);
+  }
+  return num_lines_out;
+}
+
+static int ExportAlpha(WebPDecParams* const p, int y_pos) {
+  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
+  uint8_t* const base_rgba = buf->rgba + (p->last_y + y_pos) * buf->stride;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int alpha_first =
+      (colorspace == MODE_ARGB || colorspace == MODE_Argb);
+  uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
+  int num_lines_out = 0;
+  const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
+  uint32_t alpha_mask = 0xff;
+  const int width = p->scaler_a.dst_width;
+
+  while (WebPRescalerHasPendingOutput(&p->scaler_a)) {
+    int i;
+    assert(p->last_y + y_pos + num_lines_out < p->output->height);
+    WebPRescalerExportRow(&p->scaler_a, 0);
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = p->scaler_a.dst[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    dst += buf->stride;
+    ++num_lines_out;
+  }
+  if (is_premult_alpha && alpha_mask != 0xff) {
+    WebPApplyAlphaMultiply(base_rgba, alpha_first,
+                           width, num_lines_out, buf->stride);
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) {
+  if (io->a != NULL) {
+    WebPRescaler* const scaler = &p->scaler_a;
+    int j = 0;
+    int pos = 0;
+    while (j < io->mb_h) {
+      j += WebPRescalerImport(scaler, io->mb_h - j,
+                              io->a + j * io->width, io->width);
+      pos += p->emit_alpha_row(p, pos);
+    }
+  }
+  return 0;
+}
+
+static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
+  const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
+  const int out_width  = io->scaled_width;
+  const int out_height = io->scaled_height;
+  const int uv_in_width  = (io->mb_w + 1) >> 1;
+  const int uv_in_height = (io->mb_h + 1) >> 1;
+  const size_t work_size = 2 * out_width;   // scratch memory for one rescaler
+  int32_t* work;  // rescalers work area
+  uint8_t* tmp;   // tmp storage for scaled YUV444 samples before RGB conversion
+  size_t tmp_size1, tmp_size2, total_size;
+
+  tmp_size1 = 3 * work_size;
+  tmp_size2 = 3 * out_width;
+  if (has_alpha) {
+    tmp_size1 += work_size;
+    tmp_size2 += out_width;
+  }
+  total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
+  p->memory = calloc(1ULL, total_size);
+  if (p->memory == NULL) {
+    return 0;   // memory error
+  }
+  work = (int32_t*)p->memory;
+  tmp = (uint8_t*)(work + tmp_size1);
+  WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+                   tmp + 0 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, out_width, io->mb_h, out_height,
+                   work + 0 * work_size);
+  WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+                   tmp + 1 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+                   work + 1 * work_size);
+  WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+                   tmp + 2 * out_width, out_width, out_height, 0, 1,
+                   io->mb_w, 2 * out_width, io->mb_h, 2 * out_height,
+                   work + 2 * work_size);
+  p->emit = EmitRescaledRGB;
+  WebPInitYUV444Converters();
+
+  if (has_alpha) {
+    WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+                     tmp + 3 * out_width, out_width, out_height, 0, 1,
+                     io->mb_w, out_width, io->mb_h, out_height,
+                     work + 3 * work_size);
+    p->emit_alpha = EmitRescaledAlphaRGB;
+    p->emit_alpha_row = ExportAlpha;
+    WebPInitAlphaProcessing();
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Default custom functions
+
+static int CustomSetup(VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  const WEBP_CSP_MODE colorspace = p->output->colorspace;
+  const int is_rgb = WebPIsRGBMode(colorspace);
+  const int is_alpha = WebPIsAlphaMode(colorspace);
+
+  p->memory = NULL;
+  p->emit = NULL;
+  p->emit_alpha = NULL;
+  p->emit_alpha_row = NULL;
+
+  if (!WebPIoInitFromOptions(p->options, io, is_alpha ? MODE_YUV : MODE_YUVA)) {
+    return 0;
+  }
+  if (is_alpha && WebPIsPremultipliedMode(colorspace)) {
+    WebPInitUpsamplers();
+  }
+  if (io->use_scaling) {
+    const int ok = InitRGBRescaler(io, p);
+    if (!ok) {
+      return 0;    // memory error
+    }
+  } else {
+    if (is_rgb) {
+      p->emit = EmitSampledRGB;   // default
+      if (io->fancy_upsampling) {
+#ifdef FANCY_UPSAMPLING
+        const int uv_width = (io->mb_w + 1) >> 1;
+        p->memory = WebPSafeMalloc(1ULL, (size_t)(io->mb_w + 2 * uv_width));
+        if (p->memory == NULL) {
+          return 0;   // memory error.
+        }
+        p->tmp_y = (uint8_t*)p->memory;
+        p->tmp_u = p->tmp_y + io->mb_w;
+        p->tmp_v = p->tmp_u + uv_width;
+        p->emit = EmitFancyRGB;
+        WebPInitUpsamplers();
+#endif
+      } else {
+        WebPInitSamplers();
+      }
+    }
+    if (is_alpha) {  // need transparency output
+      p->emit_alpha = EmitAlphaRGB;
+      if (is_rgb) WebPInitAlphaProcessing();
+    }
+  }
+
+  if (is_rgb) {
+    VP8YUVInit();
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static int CustomPut(const VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  const int mb_w = io->mb_w;
+  const int mb_h = io->mb_h;
+  int num_lines_out;
+  assert(!(io->mb_y & 1));
+
+  if (mb_w <= 0 || mb_h <= 0) {
+    return 0;
+  }
+  num_lines_out = p->emit(io, p);
+  if (p->emit_alpha != NULL) {
+    p->emit_alpha(io, p);
+  }
+  p->last_y += num_lines_out;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+static void CustomTeardown(const VP8Io* io) {
+  WebPDecParams* const p = (WebPDecParams*)io->opaque;
+  free(p->memory);
+  p->memory = NULL;
+}
+
+//------------------------------------------------------------------------------
+// Main entry point
+
+void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
+  io->put      = CustomPut;
+  io->setup    = CustomSetup;
+  io->teardown = CustomTeardown;
+  io->opaque   = params;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/loaders/webp/dec/meson.build b/src/loaders/webp/dec/meson.build
new file mode 100644
index 00000000..7a35e281
--- /dev/null
+++ b/src/loaders/webp/dec/meson.build
@@ -0,0 +1,22 @@
+source_file = [
+   'alphai.h',
+   'common.h',
+   'decode_vp8.h',
+   'vp8i.h',
+   'vp8li.h',
+   'webpi.h',
+   'alpha.cpp',
+   'buffer.cpp',
+   'frame.cpp',   
+   'io.cpp',
+   'quant.cpp',
+   'tree.cpp',
+   'vp8.cpp',
+   'vp8l.cpp',
+   'webp.cpp'
+]
+
+webp_deb += [declare_dependency(
+   include_directories : include_directories('.'),
+   sources : source_file
+)]
diff --git a/src/loaders/webp/dec/quant.cpp b/src/loaders/webp/dec/quant.cpp
new file mode 100644
index 00000000..5b648f94
--- /dev/null
+++ b/src/loaders/webp/dec/quant.cpp
@@ -0,0 +1,110 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Quantizer initialization
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./vp8i.h"
+
+static WEBP_INLINE int clip(int v, int M) {
+  return v < 0 ? 0 : v > M ? M : v;
+}
+
+// Paragraph 14.1
+static const uint8_t kDcTable[128] = {
+  4,     5,   6,   7,   8,   9,  10,  10,
+  11,   12,  13,  14,  15,  16,  17,  17,
+  18,   19,  20,  20,  21,  21,  22,  22,
+  23,   23,  24,  25,  25,  26,  27,  28,
+  29,   30,  31,  32,  33,  34,  35,  36,
+  37,   37,  38,  39,  40,  41,  42,  43,
+  44,   45,  46,  46,  47,  48,  49,  50,
+  51,   52,  53,  54,  55,  56,  57,  58,
+  59,   60,  61,  62,  63,  64,  65,  66,
+  67,   68,  69,  70,  71,  72,  73,  74,
+  75,   76,  76,  77,  78,  79,  80,  81,
+  82,   83,  84,  85,  86,  87,  88,  89,
+  91,   93,  95,  96,  98, 100, 101, 102,
+  104, 106, 108, 110, 112, 114, 116, 118,
+  122, 124, 126, 128, 130, 132, 134, 136,
+  138, 140, 143, 145, 148, 151, 154, 157
+};
+
+static const uint16_t kAcTable[128] = {
+  4,     5,   6,   7,   8,   9,  10,  11,
+  12,   13,  14,  15,  16,  17,  18,  19,
+  20,   21,  22,  23,  24,  25,  26,  27,
+  28,   29,  30,  31,  32,  33,  34,  35,
+  36,   37,  38,  39,  40,  41,  42,  43,
+  44,   45,  46,  47,  48,  49,  50,  51,
+  52,   53,  54,  55,  56,  57,  58,  60,
+  62,   64,  66,  68,  70,  72,  74,  76,
+  78,   80,  82,  84,  86,  88,  90,  92,
+  94,   96,  98, 100, 102, 104, 106, 108,
+  110, 112, 114, 116, 119, 122, 125, 128,
+  131, 134, 137, 140, 143, 146, 149, 152,
+  155, 158, 161, 164, 167, 170, 173, 177,
+  181, 185, 189, 193, 197, 201, 205, 209,
+  213, 217, 221, 225, 229, 234, 239, 245,
+  249, 254, 259, 264, 269, 274, 279, 284
+};
+
+//------------------------------------------------------------------------------
+// Paragraph 9.6
+
+void VP8ParseQuant(VP8Decoder* const dec) {
+  VP8BitReader* const br = &dec->br_;
+  const int base_q0 = VP8GetValue(br, 7);
+  const int dqy1_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dqy2_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dqy2_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dquv_dc = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+  const int dquv_ac = VP8Get(br) ? VP8GetSignedValue(br, 4) : 0;
+
+  const VP8SegmentHeader* const hdr = &dec->segment_hdr_;
+  int i;
+
+  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
+    int q;
+    if (hdr->use_segment_) {
+      q = hdr->quantizer_[i];
+      if (!hdr->absolute_delta_) {
+        q += base_q0;
+      }
+    } else {
+      if (i > 0) {
+        dec->dqm_[i] = dec->dqm_[0];
+        continue;
+      } else {
+        q = base_q0;
+      }
+    }
+    {
+      VP8QuantMatrix* const m = &dec->dqm_[i];
+      m->y1_mat_[0] = kDcTable[clip(q + dqy1_dc, 127)];
+      m->y1_mat_[1] = kAcTable[clip(q + 0,       127)];
+
+      m->y2_mat_[0] = kDcTable[clip(q + dqy2_dc, 127)] * 2;
+      // For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
+      // The smallest precision for that is '(x*6349) >> 12' but 16 is a good
+      // word size.
+      m->y2_mat_[1] = (kAcTable[clip(q + dqy2_ac, 127)] * 101581) >> 16;
+      if (m->y2_mat_[1] < 8) m->y2_mat_[1] = 8;
+
+      m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
+      m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
+
+      m->uv_quant_ = q + dquv_ac;   // for dithering strength evaluation
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
diff --git a/src/loaders/webp/dec/tree.cpp b/src/loaders/webp/dec/tree.cpp
new file mode 100644
index 00000000..c2007ea7
--- /dev/null
+++ b/src/loaders/webp/dec/tree.cpp
@@ -0,0 +1,525 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Coding trees and probas
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./vp8i.h"
+#include "../utils/bit_reader_inl.h"
+
+#define USE_GENERIC_TREE
+
+#ifdef USE_GENERIC_TREE
+static const int8_t kYModesIntra4[18] = {
+  -B_DC_PRED, 1,
+    -B_TM_PRED, 2,
+      -B_VE_PRED, 3,
+        4, 6,
+          -B_HE_PRED, 5,
+            -B_RD_PRED, -B_VR_PRED,
+        -B_LD_PRED, 7,
+          -B_VL_PRED, 8,
+            -B_HD_PRED, -B_HU_PRED
+};
+#endif
+
+//------------------------------------------------------------------------------
+// Default probabilities
+
+// Paragraph 13.5
+static const uint8_t
+  CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+      { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
+    },
+    { { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+      { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
+    },
+    { { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+      { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+      { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
+      { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
+      { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+    },
+    { { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+      { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+    },
+    { { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+      { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+      { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+    },
+    { { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+      { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+      { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+      { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+    },
+    { { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+      { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+    },
+    { { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+      { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+    },
+    { { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+      { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+      { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+    },
+    { { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+      { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+      { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+      { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+      { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+    },
+    { { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+      { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+      { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    },
+    { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  },
+  { { { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+      { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+      { 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+    },
+    { { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+      { 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+    },
+    { { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+      { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+      { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+    },
+    { { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+      { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+    },
+    { { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+      { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+    },
+    { { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+      { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+    },
+    { { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+      { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+      { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+    },
+    { { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  }
+};
+
+// Paragraph 11.5
+static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
+  { { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
+    { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
+    { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
+    { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
+    { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
+    { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+    { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
+    { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
+    { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
+    { 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
+  { { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
+    { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
+    { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
+    { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
+    { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
+    { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+    { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
+    { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
+    { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
+    { 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
+  { { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
+    { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
+    { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
+    { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
+    { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
+    { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+    { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
+    { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
+    { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
+    { 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
+  { { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
+    { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
+    { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
+    { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
+    { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
+    { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+    { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
+    { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
+    { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
+    { 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
+  { { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
+    { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
+    { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
+    { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
+    { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
+    { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+    { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
+    { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
+    { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
+    { 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
+  { { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
+    { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
+    { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
+    { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
+    { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
+    { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+    { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
+    { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
+    { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
+    { 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
+  { { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+    { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+    { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+    { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+    { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+    { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+    { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+    { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+    { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+    { 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
+  { { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
+    { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
+    { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
+    { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
+    { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
+    { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+    { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
+    { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
+    { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
+    { 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
+  { { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
+    { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
+    { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
+    { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
+    { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
+    { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+    { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
+    { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
+    { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
+    { 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
+  { { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
+    { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
+    { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
+    { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
+    { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
+    { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+    { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
+    { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
+    { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
+    { 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
+};
+
+void VP8ResetProba(VP8Proba* const proba) {
+  memset(proba->segments_, 255u, sizeof(proba->segments_));
+  // proba->bands_[][] is initialized later
+}
+
+static void ParseIntraMode(VP8BitReader* const br,
+                           VP8Decoder* const dec, int mb_x) {
+  uint8_t* const top = dec->intra_t_ + 4 * mb_x;
+  uint8_t* const left = dec->intra_l_;
+  VP8MBData* const block = dec->mb_data_ + mb_x;
+
+  // Note: we don't save segment map (yet), as we don't expect
+  // to decode more than 1 keyframe.
+  if (dec->segment_hdr_.update_map_) {
+    // Hardcoded tree parsing
+    block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0])
+                    ? VP8GetBit(br, dec->proba_.segments_[1])
+                    : 2 + VP8GetBit(br, dec->proba_.segments_[2]);
+  } else {
+    block->segment_ = 0;  // default for intra
+  }
+  if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_);
+
+  block->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
+  if (!block->is_i4x4_) {
+    // Hardcoded 16x16 intra-mode decision tree.
+    const int ymode =
+        VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
+                           : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
+    block->imodes_[0] = ymode;
+    memset(top, ymode, 4 * sizeof(*top));
+    memset(left, ymode, 4 * sizeof(*left));
+  } else {
+    uint8_t* modes = block->imodes_;
+    int y;
+    for (y = 0; y < 4; ++y) {
+      int ymode = left[y];
+      int x;
+      for (x = 0; x < 4; ++x) {
+        const uint8_t* const prob = kBModesProba[top[x]][ymode];
+#ifdef USE_GENERIC_TREE
+        // Generic tree-parsing
+        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
+        while (i > 0) {
+          i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
+        }
+        ymode = -i;
+#else
+        // Hardcoded tree parsing
+        ymode = !VP8GetBit(br, prob[0]) ? B_DC_PRED :
+                  !VP8GetBit(br, prob[1]) ? B_TM_PRED :
+                    !VP8GetBit(br, prob[2]) ? B_VE_PRED :
+                      !VP8GetBit(br, prob[3]) ?
+                        (!VP8GetBit(br, prob[4]) ? B_HE_PRED :
+                          (!VP8GetBit(br, prob[5]) ? B_RD_PRED : B_VR_PRED)) :
+                        (!VP8GetBit(br, prob[6]) ? B_LD_PRED :
+                          (!VP8GetBit(br, prob[7]) ? B_VL_PRED :
+                            (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
+#endif    // USE_GENERIC_TREE
+        top[x] = ymode;
+      }
+      memcpy(modes, top, 4 * sizeof(*top));
+      modes += 4;
+      left[y] = ymode;
+    }
+  }
+  // Hardcoded UVMode decision tree
+  block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
+                 : !VP8GetBit(br, 114) ? V_PRED
+                 : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
+}
+
+int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) {
+  int mb_x;
+  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+    ParseIntraMode(br, dec, mb_x);
+  }
+  return !dec->br_.eof_;
+}
+
+//------------------------------------------------------------------------------
+// Paragraph 13
+
+static const uint8_t
+    CoeffsUpdateProba[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
+  { { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 },
+      { 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }
+    },
+    { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  },
+  { { { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 },
+      { 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    },
+    { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
+      { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
+    }
+  }
+};
+
+// Paragraph 9.9
+
+static const int kBands[16 + 1] = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+  0  // extra entry as sentinel
+};
+
+void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
+  VP8Proba* const proba = &dec->proba_;
+  int t, b, c, p;
+  for (t = 0; t < NUM_TYPES; ++t) {
+    for (b = 0; b < NUM_BANDS; ++b) {
+      for (c = 0; c < NUM_CTX; ++c) {
+        for (p = 0; p < NUM_PROBAS; ++p) {
+          const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ?
+                        VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p];
+          proba->bands_[t][b].probas_[c][p] = v;
+        }
+      }
+    }
+    for (b = 0; b < 16 + 1; ++b) {
+      proba->bands_ptr_[t][b] = &proba->bands_[t][kBands[b]];
+    }
+  }
+  dec->use_skip_proba_ = VP8Get(br);
+  if (dec->use_skip_proba_) {
+    dec->skip_p_ = VP8GetValue(br, 8);
+  }
+}
+
diff --git a/src/loaders/webp/dec/vp8.cpp b/src/loaders/webp/dec/vp8.cpp
new file mode 100644
index 00000000..cdb0e8a3
--- /dev/null
+++ b/src/loaders/webp/dec/vp8.cpp
@@ -0,0 +1,681 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// main entry for the decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+
+#include "./alphai.h"
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "./webpi.h"
+#include "../utils/bit_reader_inl.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+
+int WebPGetDecoderVersion(void) {
+  return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION;
+}
+
+//------------------------------------------------------------------------------
+// VP8Decoder
+
+static void SetOk(VP8Decoder* const dec) {
+  dec->status_ = VP8_STATUS_OK;
+  dec->error_msg_ = "OK";
+}
+
+int VP8InitIoInternal(VP8Io* const io, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
+    return 0;  // mismatch error
+  }
+  if (io != NULL) {
+    memset(io, 0, sizeof(*io));
+  }
+  return 1;
+}
+
+VP8Decoder* VP8New(void) {
+  VP8Decoder* const dec = (VP8Decoder*)calloc(1ULL, sizeof(*dec));
+  if (dec != NULL) {
+    SetOk(dec);
+    dec->ready_ = 0;
+    dec->num_parts_ = 1;
+  }
+  return dec;
+}
+
+VP8StatusCode VP8Status(VP8Decoder* const dec) {
+  if (!dec) return VP8_STATUS_INVALID_PARAM;
+  return dec->status_;
+}
+
+const char* VP8StatusMessage(VP8Decoder* const dec) {
+  if (dec == NULL) return "no object";
+  if (!dec->error_msg_) return "OK";
+  return dec->error_msg_;
+}
+
+void VP8Delete(VP8Decoder* const dec) {
+  if (dec != NULL) {
+    VP8Clear(dec);
+    free(dec);
+  }
+}
+
+int VP8SetError(VP8Decoder* const dec,
+                VP8StatusCode error, const char* const msg) {
+  // The oldest error reported takes precedence over the new one.
+  if (dec->status_ == VP8_STATUS_OK) {
+    dec->status_ = error;
+    dec->error_msg_ = msg;
+    dec->ready_ = 0;
+  }
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+
+int VP8CheckSignature(const uint8_t* const data, size_t data_size) {
+  return (data_size >= 3 &&
+          data[0] == 0x9d && data[1] == 0x01 && data[2] == 0x2a);
+}
+
+int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
+               int* const width, int* const height) {
+  if (data == NULL || data_size < VP8_FRAME_HEADER_SIZE) {
+    return 0;         // not enough data
+  }
+  // check signature
+  if (!VP8CheckSignature(data + 3, data_size - 3)) {
+    return 0;         // Wrong signature.
+  } else {
+    const uint32_t bits = data[0] | (data[1] << 8) | (data[2] << 16);
+    const int key_frame = !(bits & 1);
+    const int w = ((data[7] << 8) | data[6]) & 0x3fff;
+    const int h = ((data[9] << 8) | data[8]) & 0x3fff;
+
+    if (!key_frame) {   // Not a keyframe.
+      return 0;
+    }
+
+    if (((bits >> 1) & 7) > 3) {
+      return 0;         // unknown profile
+    }
+    if (!((bits >> 4) & 1)) {
+      return 0;         // first frame is invisible!
+    }
+    if (((bits >> 5)) >= chunk_size) {  // partition_length
+      return 0;         // inconsistent size information.
+    }
+    if (w == 0 || h == 0) {
+      return 0;         // We don't support both width and height to be zero.
+    }
+
+    if (width) {
+      *width = w;
+    }
+    if (height) {
+      *height = h;
+    }
+
+    return 1;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Header parsing
+
+static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
+  assert(hdr != NULL);
+  hdr->use_segment_ = 0;
+  hdr->update_map_ = 0;
+  hdr->absolute_delta_ = 1;
+  memset(hdr->quantizer_, 0, sizeof(hdr->quantizer_));
+  memset(hdr->filter_strength_, 0, sizeof(hdr->filter_strength_));
+}
+
+// Paragraph 9.3
+static int ParseSegmentHeader(VP8BitReader* br,
+                              VP8SegmentHeader* hdr, VP8Proba* proba) {
+  assert(br != NULL);
+  assert(hdr != NULL);
+  hdr->use_segment_ = VP8Get(br);
+  if (hdr->use_segment_) {
+    hdr->update_map_ = VP8Get(br);
+    if (VP8Get(br)) {   // update data
+      int s;
+      hdr->absolute_delta_ = VP8Get(br);
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        hdr->quantizer_[s] = VP8Get(br) ? VP8GetSignedValue(br, 7) : 0;
+      }
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        hdr->filter_strength_[s] = VP8Get(br) ? VP8GetSignedValue(br, 6) : 0;
+      }
+    }
+    if (hdr->update_map_) {
+      int s;
+      for (s = 0; s < MB_FEATURE_TREE_PROBS; ++s) {
+        proba->segments_[s] = VP8Get(br) ? VP8GetValue(br, 8) : 255u;
+      }
+    }
+  } else {
+    hdr->update_map_ = 0;
+  }
+  return !br->eof_;
+}
+
+// Paragraph 9.5
+// This function returns VP8_STATUS_SUSPENDED if we don't have all the
+// necessary data in 'buf'.
+// This case is not necessarily an error (for incremental decoding).
+// Still, no bitreader is ever initialized to make it possible to read
+// unavailable memory.
+// If we don't even have the partitions' sizes, than VP8_STATUS_NOT_ENOUGH_DATA
+// is returned, and this is an unrecoverable error.
+// If the partitions were positioned ok, VP8_STATUS_OK is returned.
+static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
+                                     const uint8_t* buf, size_t size) {
+  VP8BitReader* const br = &dec->br_;
+  const uint8_t* sz = buf;
+  const uint8_t* buf_end = buf + size;
+  const uint8_t* part_start;
+  int last_part;
+  int p;
+
+  dec->num_parts_ = 1 << VP8GetValue(br, 2);
+  last_part = dec->num_parts_ - 1;
+  part_start = buf + last_part * 3;
+  if (buf_end < part_start) {
+    // we can't even read the sizes with sz[]! That's a failure.
+    return VP8_STATUS_NOT_ENOUGH_DATA;
+  }
+  for (p = 0; p < last_part; ++p) {
+    const uint32_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16);
+    const uint8_t* part_end = part_start + psize;
+    if (part_end > buf_end) part_end = buf_end;
+    VP8InitBitReader(dec->parts_ + p, part_start, part_end);
+    part_start = part_end;
+    sz += 3;
+  }
+  VP8InitBitReader(dec->parts_ + last_part, part_start, buf_end);
+  return (part_start < buf_end) ? VP8_STATUS_OK :
+           VP8_STATUS_SUSPENDED;   // Init is ok, but there's not enough data
+}
+
+// Paragraph 9.4
+static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
+  VP8FilterHeader* const hdr = &dec->filter_hdr_;
+  hdr->simple_    = VP8Get(br);
+  hdr->level_     = VP8GetValue(br, 6);
+  hdr->sharpness_ = VP8GetValue(br, 3);
+  hdr->use_lf_delta_ = VP8Get(br);
+  if (hdr->use_lf_delta_) {
+    if (VP8Get(br)) {   // update lf-delta?
+      int i;
+      for (i = 0; i < NUM_REF_LF_DELTAS; ++i) {
+        if (VP8Get(br)) {
+          hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6);
+        }
+      }
+      for (i = 0; i < NUM_MODE_LF_DELTAS; ++i) {
+        if (VP8Get(br)) {
+          hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6);
+        }
+      }
+    }
+  }
+  dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 1 : 2;
+  return !br->eof_;
+}
+
+// Topmost call
+int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
+  const uint8_t* buf;
+  size_t buf_size;
+  VP8FrameHeader* frm_hdr;
+  VP8PictureHeader* pic_hdr;
+  VP8BitReader* br;
+  VP8StatusCode status;
+
+  if (dec == NULL) {
+    return 0;
+  }
+  SetOk(dec);
+  if (io == NULL) {
+    return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
+                       "null VP8Io passed to VP8GetHeaders()");
+  }
+  buf = io->data;
+  buf_size = io->data_size;
+  if (buf_size < 4) {
+    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                       "Truncated header.");
+  }
+
+  // Paragraph 9.1
+  {
+    const uint32_t bits = buf[0] | (buf[1] << 8) | (buf[2] << 16);
+    frm_hdr = &dec->frm_hdr_;
+    frm_hdr->key_frame_ = !(bits & 1);
+    frm_hdr->profile_ = (bits >> 1) & 7;
+    frm_hdr->show_ = (bits >> 4) & 1;
+    frm_hdr->partition_length_ = (bits >> 5);
+    if (frm_hdr->profile_ > 3)
+      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                         "Incorrect keyframe parameters.");
+    if (!frm_hdr->show_)
+      return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
+                         "Frame not displayable.");
+    buf += 3;
+    buf_size -= 3;
+  }
+
+  pic_hdr = &dec->pic_hdr_;
+  if (frm_hdr->key_frame_) {
+    // Paragraph 9.2
+    if (buf_size < 7) {
+      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                         "cannot parse picture header");
+    }
+    if (!VP8CheckSignature(buf, buf_size)) {
+      return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                         "Bad code word");
+    }
+    pic_hdr->width_ = ((buf[4] << 8) | buf[3]) & 0x3fff;
+    pic_hdr->xscale_ = buf[4] >> 6;   // ratio: 1, 5/4 5/3 or 2
+    pic_hdr->height_ = ((buf[6] << 8) | buf[5]) & 0x3fff;
+    pic_hdr->yscale_ = buf[6] >> 6;
+    buf += 7;
+    buf_size -= 7;
+
+    dec->mb_w_ = (pic_hdr->width_ + 15) >> 4;
+    dec->mb_h_ = (pic_hdr->height_ + 15) >> 4;
+    // Setup default output area (can be later modified during io->setup())
+    io->width = pic_hdr->width_;
+    io->height = pic_hdr->height_;
+    io->use_scaling  = 0;
+    io->use_cropping = 0;
+    io->crop_top  = 0;
+    io->crop_left = 0;
+    io->crop_right  = io->width;
+    io->crop_bottom = io->height;
+    io->mb_w = io->width;   // sanity check
+    io->mb_h = io->height;  // ditto
+
+    VP8ResetProba(&dec->proba_);
+    ResetSegmentHeader(&dec->segment_hdr_);
+  }
+
+  // Check if we have all the partition #0 available, and initialize dec->br_
+  // to read this partition (and this partition only).
+  if (frm_hdr->partition_length_ > buf_size) {
+    return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                       "bad partition length");
+  }
+
+  br = &dec->br_;
+  VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_);
+  buf += frm_hdr->partition_length_;
+  buf_size -= frm_hdr->partition_length_;
+
+  if (frm_hdr->key_frame_) {
+    pic_hdr->colorspace_ = VP8Get(br);
+    pic_hdr->clamp_type_ = VP8Get(br);
+  }
+  if (!ParseSegmentHeader(br, &dec->segment_hdr_, &dec->proba_)) {
+    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                       "cannot parse segment header");
+  }
+  // Filter specs
+  if (!ParseFilterHeader(br, dec)) {
+    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
+                       "cannot parse filter header");
+  }
+  status = ParsePartitions(dec, buf, buf_size);
+  if (status != VP8_STATUS_OK) {
+    return VP8SetError(dec, status, "cannot parse partitions");
+  }
+
+  // quantizer change
+  VP8ParseQuant(dec);
+
+  // Frame buffer marking
+  if (!frm_hdr->key_frame_) {
+    return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
+                       "Not a key frame.");
+  }
+
+  VP8Get(br);   // ignore the value of update_proba_
+
+  VP8ParseProba(br, dec);
+
+  // sanitized state
+  dec->ready_ = 1;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Residual decoding (Paragraph 13.2 / 13.3)
+
+static const uint8_t kCat3[] = { 173, 148, 140, 0 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
+static const uint8_t kCat6[] =
+  { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
+static const uint8_t kZigzag[16] = {
+  0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
+};
+
+// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
+static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
+  int v;
+  if (!VP8GetBit(br, p[3])) {
+    if (!VP8GetBit(br, p[4])) {
+      v = 2;
+    } else {
+      v = 3 + VP8GetBit(br, p[5]);
+    }
+  } else {
+    if (!VP8GetBit(br, p[6])) {
+      if (!VP8GetBit(br, p[7])) {
+        v = 5 + VP8GetBit(br, 159);
+      } else {
+        v = 7 + 2 * VP8GetBit(br, 165);
+        v += VP8GetBit(br, 145);
+      }
+    } else {
+      const uint8_t* tab;
+      const int bit1 = VP8GetBit(br, p[8]);
+      const int bit0 = VP8GetBit(br, p[9 + bit1]);
+      const int cat = 2 * bit1 + bit0;
+      v = 0;
+      for (tab = kCat3456[cat]; *tab; ++tab) {
+        v += v + VP8GetBit(br, *tab);
+      }
+      v += 3 + (8 << cat);
+    }
+  }
+  return v;
+}
+
+// Returns the position of the last non-zero coeff plus one
+static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[],
+                     int ctx, const quant_t dq, int n, int16_t* out) {
+  const uint8_t* p = prob[n]->probas_[ctx];
+  for (; n < 16; ++n) {
+    if (!VP8GetBit(br, p[0])) {
+      return n;  // previous coeff was last non-zero coeff
+    }
+    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
+      p = prob[++n]->probas_[0];
+      if (n == 16) return 16;
+    }
+    {        // non zero coeff
+      const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
+      int v;
+      if (!VP8GetBit(br, p[2])) {
+        v = 1;
+        p = p_ctx[1];
+      } else {
+        v = GetLargeValue(br, p);
+        p = p_ctx[2];
+      }
+      out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+    }
+  }
+  return 16;
+}
+
+static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
+  nz_coeffs <<= 2;
+  nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;
+  return nz_coeffs;
+}
+
+static void TransformWHT(const int16_t* in, int16_t* out) {
+  int tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i) {
+    const int a0 = in[0 + i] + in[12 + i];
+    const int a1 = in[4 + i] + in[ 8 + i];
+    const int a2 = in[4 + i] - in[ 8 + i];
+    const int a3 = in[0 + i] - in[12 + i];
+    tmp[0  + i] = a0 + a1;
+    tmp[8  + i] = a0 - a1;
+    tmp[4  + i] = a3 + a2;
+    tmp[12 + i] = a3 - a2;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
+    const int a0 = dc             + tmp[3 + i * 4];
+    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
+    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
+    const int a3 = dc             - tmp[3 + i * 4];
+    out[ 0] = (a0 + a1) >> 3;
+    out[16] = (a3 + a2) >> 3;
+    out[32] = (a0 - a1) >> 3;
+    out[48] = (a3 - a2) >> 3;
+    out += 64;
+  }
+}
+
+static int ParseResiduals(VP8Decoder* const dec,
+                          VP8MB* const mb, VP8BitReader* const token_br) {
+  const VP8BandProbas* (* const bands)[16 + 1] = dec->proba_.bands_ptr_;
+  const VP8BandProbas* const * ac_proba;
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+  const VP8QuantMatrix* const q = &dec->dqm_[block->segment_];
+  int16_t* dst = block->coeffs_;
+  VP8MB* const left_mb = dec->mb_info_ - 1;
+  uint8_t tnz, lnz;
+  uint32_t non_zero_y = 0;
+  uint32_t non_zero_uv = 0;
+  int x, y, ch;
+  uint32_t out_t_nz, out_l_nz;
+  int first;
+
+  memset(dst, 0, 384 * sizeof(*dst));
+  if (!block->is_i4x4_) {    // parse DC
+    int16_t dc[16] = { 0 };
+    const int ctx = mb->nz_dc_ + left_mb->nz_dc_;
+    const int nz = GetCoeffs(token_br, bands[1], ctx, q->y2_mat_, 0, dc);
+    mb->nz_dc_ = left_mb->nz_dc_ = (nz > 0);
+    if (nz > 1) {   // more than just the DC -> perform the full transform
+      TransformWHT(dc, dst);
+    } else {        // only DC is non-zero -> inlined simplified transform
+      int i;
+      const int dc0 = (dc[0] + 3) >> 3;
+      for (i = 0; i < 16 * 16; i += 16) dst[i] = dc0;
+    }
+    first = 1;
+    ac_proba = bands[0];
+  } else {
+    first = 0;
+    ac_proba = bands[3];
+  }
+
+  tnz = mb->nz_ & 0x0f;
+  lnz = left_mb->nz_ & 0x0f;
+  for (y = 0; y < 4; ++y) {
+    int l = lnz & 1;
+    uint32_t nz_coeffs = 0;
+    for (x = 0; x < 4; ++x) {
+      const int ctx = l + (tnz & 1);
+      const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst);
+      l = (nz > first);
+      tnz = (tnz >> 1) | (l << 7);
+      nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
+      dst += 16;
+    }
+    tnz >>= 4;
+    lnz = (lnz >> 1) | (l << 7);
+    non_zero_y = (non_zero_y << 8) | nz_coeffs;
+  }
+  out_t_nz = tnz;
+  out_l_nz = lnz >> 4;
+
+  for (ch = 0; ch < 4; ch += 2) {
+    uint32_t nz_coeffs = 0;
+    tnz = mb->nz_ >> (4 + ch);
+    lnz = left_mb->nz_ >> (4 + ch);
+    for (y = 0; y < 2; ++y) {
+      int l = lnz & 1;
+      for (x = 0; x < 2; ++x) {
+        const int ctx = l + (tnz & 1);
+        const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst);
+        l = (nz > 0);
+        tnz = (tnz >> 1) | (l << 3);
+        nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
+        dst += 16;
+      }
+      tnz >>= 2;
+      lnz = (lnz >> 1) | (l << 5);
+    }
+    // Note: we don't really need the per-4x4 details for U/V blocks.
+    non_zero_uv |= nz_coeffs << (4 * ch);
+    out_t_nz |= (tnz << 4) << ch;
+    out_l_nz |= (lnz & 0xf0) << ch;
+  }
+  mb->nz_ = out_t_nz;
+  left_mb->nz_ = out_l_nz;
+
+  block->non_zero_y_ = non_zero_y;
+  block->non_zero_uv_ = non_zero_uv;
+
+  // We look at the mode-code of each block and check if some blocks have less
+  // than three non-zero coeffs (code < 2). This is to avoid dithering flat and
+  // empty blocks.
+  block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
+
+  return !(non_zero_y | non_zero_uv);  // will be used for further optimization
+}
+
+//------------------------------------------------------------------------------
+// Main loop
+
+int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
+  VP8MB* const left = dec->mb_info_ - 1;
+  VP8MB* const mb = dec->mb_info_ + dec->mb_x_;
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+  int skip = dec->use_skip_proba_ ? block->skip_ : 0;
+
+  if (!skip) {
+    skip = ParseResiduals(dec, mb, token_br);
+  } else {
+    left->nz_ = mb->nz_ = 0;
+    if (!block->is_i4x4_) {
+      left->nz_dc_ = mb->nz_dc_ = 0;
+    }
+    block->non_zero_y_ = 0;
+    block->non_zero_uv_ = 0;
+    block->dither_ = 0;
+  }
+
+  if (dec->filter_type_ > 0) {  // store filter info
+    VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
+    *finfo = dec->fstrengths_[block->segment_][block->is_i4x4_];
+    finfo->f_inner_ |= !skip;
+  }
+
+  return !token_br->eof_;
+}
+
+void VP8InitScanline(VP8Decoder* const dec) {
+  VP8MB* const left = dec->mb_info_ - 1;
+  left->nz_ = 0;
+  left->nz_dc_ = 0;
+  memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
+  dec->mb_x_ = 0;
+}
+
+static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
+  for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
+    // Parse bitstream for this row.
+    VP8BitReader* const token_br =
+        &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
+    if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
+      return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                         "Premature end-of-partition0 encountered.");
+    }
+    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
+      if (!VP8DecodeMB(dec, token_br)) {
+        return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
+                           "Premature end-of-file encountered.");
+      }
+    }
+    VP8InitScanline(dec);   // Prepare for next scanline
+
+    // Reconstruct, filter and emit the row.
+    if (!VP8ProcessRow(dec, io)) {
+      return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
+    }
+  }
+  return 1;
+}
+
+// Main entry point
+int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
+  int ok = 0;
+  if (dec == NULL) {
+    return 0;
+  }
+  if (io == NULL) {
+    return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
+                       "NULL VP8Io parameter in VP8Decode().");
+  }
+
+  if (!dec->ready_) {
+    if (!VP8GetHeaders(dec, io)) {
+      return 0;
+    }
+  }
+  assert(dec->ready_);
+
+  // Finish setting up the decoding parameter. Will call io->setup().
+  ok = (VP8EnterCritical(dec, io) == VP8_STATUS_OK);
+  if (ok) {   // good to go.
+    // Will allocate memory and prepare everything.
+    if (ok) ok = VP8InitFrame(dec, io);
+
+    // Main decoding loop
+    if (ok) ok = ParseFrame(dec, io);
+
+    // Exit.
+    ok &= VP8ExitCritical(dec, io);
+  }
+
+  if (!ok) {
+    VP8Clear(dec);
+    return 0;
+  }
+
+  dec->ready_ = 0;
+  return ok;
+}
+
+void VP8Clear(VP8Decoder* const dec) {
+  if (dec == NULL) {
+    return;
+  }
+  ALPHDelete(dec->alph_dec_);
+  dec->alph_dec_ = NULL;
+  free(dec->mem_);
+  dec->mem_ = NULL;
+  dec->mem_size_ = 0;
+  memset(&dec->br_, 0, sizeof(dec->br_));
+  dec->ready_ = 0;
+}
+
+//------------------------------------------------------------------------------
+
diff --git a/src/loaders/webp/dec/vp8i.h b/src/loaders/webp/dec/vp8i.h
new file mode 100644
index 00000000..0164f988
--- /dev/null
+++ b/src/loaders/webp/dec/vp8i.h
@@ -0,0 +1,307 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// VP8 decoder: internal header.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DEC_VP8I_H_
+#define WEBP_DEC_VP8I_H_
+
+#include <string.h>     // for memcpy()
+#include "./common.h"
+#include "./vp8li.h"
+#include "../utils/bit_reader.h"
+#include "../utils/random.h"
+#include "../dsp/dsp.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Various defines and enums
+
+// version numbers
+#define DEC_MAJ_VERSION 0
+#define DEC_MIN_VERSION 4
+#define DEC_REV_VERSION 3
+
+// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
+// Constraints are: We need to store one 16x16 block of luma samples (y),
+// and two 8x8 chroma blocks (u/v). These are better be 16-bytes aligned,
+// in order to be SIMD-friendly. We also need to store the top, left and
+// top-left samples (from previously decoded blocks), along with four
+// extra top-right samples for luma (intra4x4 prediction only).
+// One possible layout is, using 32 * (17 + 9) bytes:
+//
+//   .+------   <- only 1 pixel high
+//   .|yyyyt.
+//   .|yyyyt.
+//   .|yyyyt.
+//   .|yyyy..
+//   .+--.+--   <- only 1 pixel high
+//   .|uu.|vv
+//   .|uu.|vv
+//
+// Every character is a 4x4 block, with legend:
+//  '.' = unused
+//  'y' = y-samples   'u' = u-samples     'v' = u-samples
+//  '|' = left sample,   '-' = top sample,    '+' = top-left sample
+//  't' = extra top-right sample for 4x4 modes
+#define YUV_SIZE (BPS * 17 + BPS * 9)
+#define Y_SIZE   (BPS * 17)
+#define Y_OFF    (BPS * 1 + 8)
+#define U_OFF    (Y_OFF + BPS * 16 + BPS)
+#define V_OFF    (U_OFF + 16)
+
+// minimal width under which lossy multi-threading is always disabled
+#define MIN_WIDTH_FOR_THREADS 512
+
+//------------------------------------------------------------------------------
+// Headers
+
+typedef struct {
+  uint8_t key_frame_;
+  uint8_t profile_;
+  uint8_t show_;
+  uint32_t partition_length_;
+} VP8FrameHeader;
+
+typedef struct {
+  uint16_t width_;
+  uint16_t height_;
+  uint8_t xscale_;
+  uint8_t yscale_;
+  uint8_t colorspace_;   // 0 = YCbCr
+  uint8_t clamp_type_;
+} VP8PictureHeader;
+
+// segment features
+typedef struct {
+  int use_segment_;
+  int update_map_;        // whether to update the segment map or not
+  int absolute_delta_;    // absolute or delta values for quantizer and filter
+  int8_t quantizer_[NUM_MB_SEGMENTS];        // quantization changes
+  int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
+} VP8SegmentHeader;
+
+// probas associated to one of the contexts
+typedef uint8_t VP8ProbaArray[NUM_PROBAS];
+
+typedef struct {   // all the probas associated to one band
+  VP8ProbaArray probas_[NUM_CTX];
+} VP8BandProbas;
+
+// Struct collecting all frame-persistent probabilities.
+typedef struct {
+  uint8_t segments_[MB_FEATURE_TREE_PROBS];
+  // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
+  VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
+  const VP8BandProbas* bands_ptr_[NUM_TYPES][16 + 1];
+} VP8Proba;
+
+// Filter parameters
+typedef struct {
+  int simple_;                  // 0=complex, 1=simple
+  int level_;                   // [0..63]
+  int sharpness_;               // [0..7]
+  int use_lf_delta_;
+  int ref_lf_delta_[NUM_REF_LF_DELTAS];
+  int mode_lf_delta_[NUM_MODE_LF_DELTAS];
+} VP8FilterHeader;
+
+//------------------------------------------------------------------------------
+// Informations about the macroblocks.
+
+typedef struct {  // filter specs
+  uint8_t f_limit_;      // filter limit in [3..189], or 0 if no filtering
+  uint8_t f_ilevel_;     // inner limit in [1..63]
+  uint8_t f_inner_;      // do inner filtering?
+  uint8_t hev_thresh_;   // high edge variance threshold in [0..2]
+} VP8FInfo;
+
+typedef struct {  // Top/Left Contexts used for syntax-parsing
+  uint8_t nz_;        // non-zero AC/DC coeffs (4bit for luma + 4bit for chroma)
+  uint8_t nz_dc_;     // non-zero DC coeff (1bit)
+} VP8MB;
+
+// Dequantization matrices
+typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
+typedef struct {
+  quant_t y1_mat_, y2_mat_, uv_mat_;
+
+  int uv_quant_;   // U/V quantizer value
+  int dither_;     // dithering amplitude (0 = off, max=255)
+} VP8QuantMatrix;
+
+// Data needed to reconstruct a macroblock
+typedef struct {
+  int16_t coeffs_[384];   // 384 coeffs = (16+4+4) * 4*4
+  uint8_t is_i4x4_;       // true if intra4x4
+  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
+  uint8_t uvmode_;        // chroma prediction mode
+  // bit-wise info about the content of each sub-4x4 blocks (in decoding order).
+  // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to:
+  //   code=0 -> no coefficient
+  //   code=1 -> only DC
+  //   code=2 -> first three coefficients are non-zero
+  //   code=3 -> more than three coefficients are non-zero
+  // This allows to call specialized transform functions.
+  uint32_t non_zero_y_;
+  uint32_t non_zero_uv_;
+  uint8_t dither_;      // local dithering strength (deduced from non_zero_*)
+  uint8_t skip_;
+  uint8_t segment_;
+} VP8MBData;
+
+// Persistent information needed by the parallel processing
+typedef struct {
+  int id_;              // cache row to process (in [0..2])
+  int mb_y_;            // macroblock position of the row
+  int filter_row_;      // true if row-filtering is needed
+  VP8FInfo* f_info_;    // filter strengths (swapped with dec->f_info_)
+  VP8MBData* mb_data_;  // reconstruction data (swapped with dec->mb_data_)
+  VP8Io io_;            // copy of the VP8Io to pass to put()
+} VP8ThreadContext;
+
+// Saved top samples, per macroblock. Fits into a cache-line.
+typedef struct {
+  uint8_t y[16], u[8], v[8];
+} VP8TopSamples;
+
+//------------------------------------------------------------------------------
+// VP8Decoder: the main opaque structure handed over to user
+
+struct VP8Decoder {
+  VP8StatusCode status_;
+  int ready_;     // true if ready to decode a picture with VP8Decode()
+  const char* error_msg_;  // set when status_ is not OK.
+
+  // Main data source
+  VP8BitReader br_;
+
+  // headers
+  VP8FrameHeader   frm_hdr_;
+  VP8PictureHeader pic_hdr_;
+  VP8FilterHeader  filter_hdr_;
+  VP8SegmentHeader segment_hdr_;
+
+  int cache_id_;       // current cache row
+  int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
+  VP8ThreadContext thread_ctx_;  // Thread context
+
+  // dimension, in macroblock units.
+  int mb_w_, mb_h_;
+
+  // Macroblock to process/filter, depending on cropping and filter_type.
+  int tl_mb_x_, tl_mb_y_;  // top-left MB that must be in-loop filtered
+  int br_mb_x_, br_mb_y_;  // last bottom-right MB that must be decoded
+
+  // number of partitions.
+  int num_parts_;
+  // per-partition boolean decoders.
+  VP8BitReader parts_[MAX_NUM_PARTITIONS];
+
+  // Dithering strength, deduced from decoding options
+  int dither_;                // whether to use dithering or not
+  VP8Random dithering_rg_;    // random generator for dithering
+
+  // dequantization (one set of DC/AC dequant factor per segment)
+  VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
+
+  // probabilities
+  VP8Proba proba_;
+  int use_skip_proba_;
+  uint8_t skip_p_;
+
+  // Boundary data cache and persistent buffers.
+  uint8_t* intra_t_;      // top intra modes values: 4 * mb_w_
+  uint8_t  intra_l_[4];   // left intra modes values
+
+  VP8TopSamples* yuv_t_;  // top y/u/v samples
+
+  VP8MB* mb_info_;        // contextual macroblock info (mb_w_ + 1)
+  VP8FInfo* f_info_;      // filter strength info
+  uint8_t* yuv_b_;        // main block for Y/U/V (size = YUV_SIZE)
+
+  uint8_t* cache_y_;      // macroblock row for storing unfiltered samples
+  uint8_t* cache_u_;
+  uint8_t* cache_v_;
+  int cache_y_stride_;
+  int cache_uv_stride_;
+
+  // main memory chunk for the above data. Persistent.
+  void* mem_;
+  size_t mem_size_;
+
+  // Per macroblock non-persistent infos.
+  int mb_x_, mb_y_;       // current position, in macroblock units
+  VP8MBData* mb_data_;    // parsed reconstruction data
+
+  // Filtering side-info
+  int filter_type_;                          // 0=off, 1=simple, 2=complex
+  VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type
+
+  // Alpha
+  struct ALPHDecoder* alph_dec_;  // alpha-plane decoder object
+  const uint8_t* alpha_data_;     // compressed alpha data (if present)
+  size_t alpha_data_size_;
+  int is_alpha_decoded_;  // true if alpha_data_ is decoded in alpha_plane_
+  uint8_t* alpha_plane_;  // output. Persistent, contains the whole data.
+  int alpha_dithering_;   // derived from decoding options (0=off, 100=full).
+};
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+// in vp8.c
+int VP8SetError(VP8Decoder* const dec,
+                VP8StatusCode error, const char* const msg);
+
+// in tree.c
+void VP8ResetProba(VP8Proba* const proba);
+void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec);
+// parses one row of intra mode data in partition 0, returns !eof
+int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec);
+
+// in quant.c
+void VP8ParseQuant(VP8Decoder* const dec);
+
+// in frame.c
+int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
+// Call io->setup() and finish setting up scan parameters.
+// After this call returns, one must always call VP8ExitCritical() with the
+// same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
+// if ok, otherwise sets and returns the error status on *dec.
+VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
+// Must always be called in pair with VP8EnterCritical().
+// Returns false in case of error.
+int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
+// Initialize dithering post-process if needed.
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec);
+// Process the last decoded row (filtering + output).
+int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
+// To be called at the start of a new scanline, to initialize predictors.
+void VP8InitScanline(VP8Decoder* const dec);
+// Decode one macroblock. Returns false if there is not enough data.
+int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
+
+// in alpha.c
+const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
+                                      int row, int num_rows);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_VP8I_H_ */
diff --git a/src/loaders/webp/dec/vp8l.cpp b/src/loaders/webp/dec/vp8l.cpp
new file mode 100644
index 00000000..8ddeb7ef
--- /dev/null
+++ b/src/loaders/webp/dec/vp8l.cpp
@@ -0,0 +1,1580 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// main entry for the decoder
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+
+#include <stdlib.h>
+
+#include "./alphai.h"
+#include "./vp8li.h"
+#include "../dsp/dsp.h"
+#include "../dsp/lossless.h"
+#include "../dsp/yuv.h"
+#include "../utils/huffman.h"
+#include "../utils/utils.h"
+
+#define NUM_ARGB_CACHE_ROWS          16
+
+static const int kCodeLengthLiterals = 16;
+static const int kCodeLengthRepeatCode = 16;
+static const int kCodeLengthExtraBits[3] = { 2, 3, 7 };
+static const int kCodeLengthRepeatOffsets[3] = { 3, 3, 11 };
+
+// -----------------------------------------------------------------------------
+//  Five Huffman codes are used at each meta code:
+//  1. green + length prefix codes + color cache codes,
+//  2. alpha,
+//  3. red,
+//  4. blue, and,
+//  5. distance prefix codes.
+typedef enum {
+  GREEN = 0,
+  RED   = 1,
+  BLUE  = 2,
+  ALPHA = 3,
+  DIST  = 4
+} HuffIndex;
+
+static const uint16_t kAlphabetSize[HUFFMAN_CODES_PER_META_CODE] = {
+  NUM_LITERAL_CODES + NUM_LENGTH_CODES,
+  NUM_LITERAL_CODES, NUM_LITERAL_CODES, NUM_LITERAL_CODES,
+  NUM_DISTANCE_CODES
+};
+
+static const uint8_t kLiteralMap[HUFFMAN_CODES_PER_META_CODE] = {
+  0, 1, 1, 1, 0
+};
+
+#define NUM_CODE_LENGTH_CODES       19
+static const uint8_t kCodeLengthCodeOrder[NUM_CODE_LENGTH_CODES] = {
+  17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+#define CODE_TO_PLANE_CODES        120
+static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
+  0x18, 0x07, 0x17, 0x19, 0x28, 0x06, 0x27, 0x29, 0x16, 0x1a,
+  0x26, 0x2a, 0x38, 0x05, 0x37, 0x39, 0x15, 0x1b, 0x36, 0x3a,
+  0x25, 0x2b, 0x48, 0x04, 0x47, 0x49, 0x14, 0x1c, 0x35, 0x3b,
+  0x46, 0x4a, 0x24, 0x2c, 0x58, 0x45, 0x4b, 0x34, 0x3c, 0x03,
+  0x57, 0x59, 0x13, 0x1d, 0x56, 0x5a, 0x23, 0x2d, 0x44, 0x4c,
+  0x55, 0x5b, 0x33, 0x3d, 0x68, 0x02, 0x67, 0x69, 0x12, 0x1e,
+  0x66, 0x6a, 0x22, 0x2e, 0x54, 0x5c, 0x43, 0x4d, 0x65, 0x6b,
+  0x32, 0x3e, 0x78, 0x01, 0x77, 0x79, 0x53, 0x5d, 0x11, 0x1f,
+  0x64, 0x6c, 0x42, 0x4e, 0x76, 0x7a, 0x21, 0x2f, 0x75, 0x7b,
+  0x31, 0x3f, 0x63, 0x6d, 0x52, 0x5e, 0x00, 0x74, 0x7c, 0x41,
+  0x4f, 0x10, 0x20, 0x62, 0x6e, 0x30, 0x73, 0x7d, 0x51, 0x5f,
+  0x40, 0x72, 0x7e, 0x61, 0x6f, 0x50, 0x71, 0x7f, 0x60, 0x70
+};
+
+// Memory needed for lookup tables of one Huffman tree group. Red, blue, alpha
+// and distance alphabets are constant (256 for red, blue and alpha, 40 for
+// distance) and lookup table sizes for them in worst case are 630 and 410
+// respectively. Size of green alphabet depends on color cache size and is equal
+// to 256 (green component values) + 24 (length prefix values)
+// + color_cache_size (between 0 and 2048).
+// All values computed for 8-bit first level lookup with Mark Adler's tool:
+// http://www.hdfgroup.org/ftp/lib-external/zlib/zlib-1.2.5/examples/enough.c
+#define FIXED_TABLE_SIZE (630 * 3 + 410)
+static const int kTableSize[12] = {
+  FIXED_TABLE_SIZE + 654,
+  FIXED_TABLE_SIZE + 656,
+  FIXED_TABLE_SIZE + 658,
+  FIXED_TABLE_SIZE + 662,
+  FIXED_TABLE_SIZE + 670,
+  FIXED_TABLE_SIZE + 686,
+  FIXED_TABLE_SIZE + 718,
+  FIXED_TABLE_SIZE + 782,
+  FIXED_TABLE_SIZE + 912,
+  FIXED_TABLE_SIZE + 1168,
+  FIXED_TABLE_SIZE + 1680,
+  FIXED_TABLE_SIZE + 2704
+};
+
+static int DecodeImageStream(int xsize, int ysize,
+                             int is_level0,
+                             VP8LDecoder* const dec,
+                             uint32_t** const decoded_data);
+
+//------------------------------------------------------------------------------
+
+int VP8LCheckSignature(const uint8_t* const data, size_t size) {
+  return (size >= VP8L_FRAME_HEADER_SIZE &&
+          data[0] == VP8L_MAGIC_BYTE &&
+          (data[4] >> 5) == 0);  // version
+}
+
+static int ReadImageInfo(VP8LBitReader* const br,
+                         int* const width, int* const height,
+                         int* const has_alpha) {
+  if (VP8LReadBits(br, 8) != VP8L_MAGIC_BYTE) return 0;
+  *width = VP8LReadBits(br, VP8L_IMAGE_SIZE_BITS) + 1;
+  *height = VP8LReadBits(br, VP8L_IMAGE_SIZE_BITS) + 1;
+  *has_alpha = VP8LReadBits(br, 1);
+  if (VP8LReadBits(br, VP8L_VERSION_BITS) != 0) return 0;
+  return !br->eos_;
+}
+
+int VP8LGetInfo(const uint8_t* data, size_t data_size,
+                int* const width, int* const height, int* const has_alpha) {
+  if (data == NULL || data_size < VP8L_FRAME_HEADER_SIZE) {
+    return 0;         // not enough data
+  } else if (!VP8LCheckSignature(data, data_size)) {
+    return 0;         // bad signature
+  } else {
+    int w, h, a;
+    VP8LBitReader br;
+    VP8LInitBitReader(&br, data, data_size);
+    if (!ReadImageInfo(&br, &w, &h, &a)) {
+      return 0;
+    }
+    if (width != NULL) *width = w;
+    if (height != NULL) *height = h;
+    if (has_alpha != NULL) *has_alpha = a;
+    return 1;
+  }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int GetCopyDistance(int distance_symbol,
+                                       VP8LBitReader* const br) {
+  int extra_bits, offset;
+  if (distance_symbol < 4) {
+    return distance_symbol + 1;
+  }
+  extra_bits = (distance_symbol - 2) >> 1;
+  offset = (2 + (distance_symbol & 1)) << extra_bits;
+  return offset + VP8LReadBits(br, extra_bits) + 1;
+}
+
+static WEBP_INLINE int GetCopyLength(int length_symbol,
+                                     VP8LBitReader* const br) {
+  // Length and distance prefixes are encoded the same way.
+  return GetCopyDistance(length_symbol, br);
+}
+
+static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
+  if (plane_code > CODE_TO_PLANE_CODES) {
+    return plane_code - CODE_TO_PLANE_CODES;
+  } else {
+    const int dist_code = kCodeToPlane[plane_code - 1];
+    const int yoffset = dist_code >> 4;
+    const int xoffset = 8 - (dist_code & 0xf);
+    const int dist = yoffset * xsize + xoffset;
+    return (dist >= 1) ? dist : 1;  // dist<1 can happen if xsize is very small
+  }
+}
+
+//------------------------------------------------------------------------------
+// Decodes the next Huffman code from bit-stream.
+// FillBitWindow(br) needs to be called at minimum every second call
+// to ReadSymbol, in order to pre-fetch enough bits.
+static WEBP_INLINE int ReadSymbol(const HuffmanCode* table,
+                                  VP8LBitReader* const br) {
+  int nbits;
+  uint32_t val = VP8LPrefetchBits(br);
+  table += val & HUFFMAN_TABLE_MASK;
+  nbits = table->bits - HUFFMAN_TABLE_BITS;
+  if (nbits > 0) {
+    VP8LSetBitPos(br, br->bit_pos_ + HUFFMAN_TABLE_BITS);
+    val = VP8LPrefetchBits(br);
+    table += table->value;
+    table += val & ((1 << nbits) - 1);
+  }
+  VP8LSetBitPos(br, br->bit_pos_ + table->bits);
+  return table->value;
+}
+
+static int ReadHuffmanCodeLengths(
+    VP8LDecoder* const dec, const int* const code_length_code_lengths,
+    int num_symbols, int* const code_lengths) {
+  int ok = 0;
+  VP8LBitReader* const br = &dec->br_;
+  int symbol;
+  int max_symbol;
+  int prev_code_len = DEFAULT_CODE_LENGTH;
+  HuffmanCode table[1 << LENGTHS_TABLE_BITS];
+
+  if (!VP8LBuildHuffmanTable(table, LENGTHS_TABLE_BITS,
+                             code_length_code_lengths,
+                             NUM_CODE_LENGTH_CODES)) {
+    goto End;
+  }
+
+  if (VP8LReadBits(br, 1)) {    // use length
+    const int length_nbits = 2 + 2 * VP8LReadBits(br, 3);
+    max_symbol = 2 + VP8LReadBits(br, length_nbits);
+    if (max_symbol > num_symbols) {
+      goto End;
+    }
+  } else {
+    max_symbol = num_symbols;
+  }
+
+  symbol = 0;
+  while (symbol < num_symbols) {
+    const HuffmanCode* p;
+    int code_len;
+    if (max_symbol-- == 0) break;
+    VP8LFillBitWindow(br);
+    p = &table[VP8LPrefetchBits(br) & LENGTHS_TABLE_MASK];
+    VP8LSetBitPos(br, br->bit_pos_ + p->bits);
+    code_len = p->value;
+    if (code_len < kCodeLengthLiterals) {
+      code_lengths[symbol++] = code_len;
+      if (code_len != 0) prev_code_len = code_len;
+    } else {
+      const int use_prev = (code_len == kCodeLengthRepeatCode);
+      const int slot = code_len - kCodeLengthLiterals;
+      const int extra_bits = kCodeLengthExtraBits[slot];
+      const int repeat_offset = kCodeLengthRepeatOffsets[slot];
+      int repeat = VP8LReadBits(br, extra_bits) + repeat_offset;
+      if (symbol + repeat > num_symbols) {
+        goto End;
+      } else {
+        const int length = use_prev ? prev_code_len : 0;
+        while (repeat-- > 0) code_lengths[symbol++] = length;
+      }
+    }
+  }
+  ok = 1;
+
+ End:
+  if (!ok) dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+  return ok;
+}
+
+// 'code_lengths' is pre-allocated temporary buffer, used for creating Huffman
+// tree.
+static int ReadHuffmanCode(int alphabet_size, VP8LDecoder* const dec,
+                           int* const code_lengths, HuffmanCode* const table) {
+  int ok = 0;
+  int size = 0;
+  VP8LBitReader* const br = &dec->br_;
+  const int simple_code = VP8LReadBits(br, 1);
+
+  memset(code_lengths, 0, alphabet_size * sizeof(*code_lengths));
+
+  if (simple_code) {  // Read symbols, codes & code lengths directly.
+    const int num_symbols = VP8LReadBits(br, 1) + 1;
+    const int first_symbol_len_code = VP8LReadBits(br, 1);
+    // The first code is either 1 bit or 8 bit code.
+    int symbol = VP8LReadBits(br, (first_symbol_len_code == 0) ? 1 : 8);
+    code_lengths[symbol] = 1;
+    // The second code (if present), is always 8 bit long.
+    if (num_symbols == 2) {
+      symbol = VP8LReadBits(br, 8);
+      code_lengths[symbol] = 1;
+    }
+    ok = 1;
+  } else {  // Decode Huffman-coded code lengths.
+    int i;
+    int code_length_code_lengths[NUM_CODE_LENGTH_CODES] = { 0 };
+    const int num_codes = VP8LReadBits(br, 4) + 4;
+    if (num_codes > NUM_CODE_LENGTH_CODES) {
+      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+      return 0;
+    }
+
+    for (i = 0; i < num_codes; ++i) {
+      code_length_code_lengths[kCodeLengthCodeOrder[i]] = VP8LReadBits(br, 3);
+    }
+    ok = ReadHuffmanCodeLengths(dec, code_length_code_lengths, alphabet_size,
+                                code_lengths);
+  }
+
+  ok = ok && !br->eos_;
+  if (ok) {
+    size = VP8LBuildHuffmanTable(table, HUFFMAN_TABLE_BITS,
+                                 code_lengths, alphabet_size);
+  }
+  if (!ok || size == 0) {
+    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    return 0;
+  }
+  return size;
+}
+
+static int ReadHuffmanCodes(VP8LDecoder* const dec, int xsize, int ysize,
+                            int color_cache_bits, int allow_recursion) {
+  int i, j;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LMetadata* const hdr = &dec->hdr_;
+  uint32_t* huffman_image = NULL;
+  HTreeGroup* htree_groups = NULL;
+  HuffmanCode* huffman_tables = NULL;
+  HuffmanCode* next = NULL;
+  int num_htree_groups = 1;
+  int max_alphabet_size = 0;
+  int* code_lengths = NULL;
+  const int table_size = kTableSize[color_cache_bits];
+
+  if (allow_recursion && VP8LReadBits(br, 1)) {
+    // use meta Huffman codes.
+    const int huffman_precision = VP8LReadBits(br, 3) + 2;
+    const int huffman_xsize = VP8LSubSampleSize(xsize, huffman_precision);
+    const int huffman_ysize = VP8LSubSampleSize(ysize, huffman_precision);
+    const int huffman_pixs = huffman_xsize * huffman_ysize;
+    if (!DecodeImageStream(huffman_xsize, huffman_ysize, 0, dec,
+                           &huffman_image)) {
+      goto Error;
+    }
+    hdr->huffman_subsample_bits_ = huffman_precision;
+    for (i = 0; i < huffman_pixs; ++i) {
+      // The huffman data is stored in red and green bytes.
+      const int group = (huffman_image[i] >> 8) & 0xffff;
+      huffman_image[i] = group;
+      if (group >= num_htree_groups) {
+        num_htree_groups = group + 1;
+      }
+    }
+  }
+
+  if (br->eos_) goto Error;
+
+  // Find maximum alphabet size for the htree group.
+  for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+    int alphabet_size = kAlphabetSize[j];
+    if (j == 0 && color_cache_bits > 0) {
+      alphabet_size += 1 << color_cache_bits;
+    }
+    if (max_alphabet_size < alphabet_size) {
+      max_alphabet_size = alphabet_size;
+    }
+  }
+
+  huffman_tables = (HuffmanCode*)malloc(num_htree_groups * table_size * sizeof(*huffman_tables));
+  htree_groups = VP8LHtreeGroupsNew(num_htree_groups);
+  code_lengths = (int*)calloc((uint64_t)max_alphabet_size, sizeof(*code_lengths));
+
+  if (htree_groups == NULL || code_lengths == NULL || huffman_tables == NULL) {
+    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  next = huffman_tables;
+  for (i = 0; i < num_htree_groups; ++i) {
+    HTreeGroup* const htree_group = &htree_groups[i];
+    HuffmanCode** const htrees = htree_group->htrees;
+    int size;
+    int is_trivial_literal = 1;
+    for (j = 0; j < HUFFMAN_CODES_PER_META_CODE; ++j) {
+      int alphabet_size = kAlphabetSize[j];
+      htrees[j] = next;
+      if (j == 0 && color_cache_bits > 0) {
+        alphabet_size += 1 << color_cache_bits;
+      }
+      size = ReadHuffmanCode(alphabet_size, dec, code_lengths, next);
+      if (is_trivial_literal && kLiteralMap[j] == 1) {
+        is_trivial_literal = (next->bits == 0);
+      }
+      next += size;
+      if (size == 0) {
+        goto Error;
+      }
+    }
+    htree_group->is_trivial_literal = is_trivial_literal;
+    if (is_trivial_literal) {
+      const int red = htrees[RED][0].value;
+      const int blue = htrees[BLUE][0].value;
+      const int alpha = htrees[ALPHA][0].value;
+      htree_group->literal_arb =
+          ((uint32_t)alpha << 24) | (red << 16) | blue;
+    }
+  }
+  free(code_lengths);
+
+  // All OK. Finalize pointers and return.
+  hdr->huffman_image_ = huffman_image;
+  hdr->num_htree_groups_ = num_htree_groups;
+  hdr->htree_groups_ = htree_groups;
+  hdr->huffman_tables_ = huffman_tables;
+  return 1;
+
+ Error:
+  free(code_lengths);
+  free(huffman_image);
+  free(huffman_tables);
+  VP8LHtreeGroupsFree(htree_groups);
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// Scaling.
+
+static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
+  const int num_channels = 4;
+  const int in_width = io->mb_w;
+  const int out_width = io->scaled_width;
+  const int in_height = io->mb_h;
+  const int out_height = io->scaled_height;
+  const uint64_t work_size = 2 * num_channels * (uint64_t)out_width;
+  int32_t* work;        // Rescaler work area.
+  const uint64_t scaled_data_size = num_channels * (uint64_t)out_width;
+  uint32_t* scaled_data;  // Temporary storage for scaled BGRA data.
+  const uint64_t memory_size = sizeof(*dec->rescaler) +
+                               work_size * sizeof(*work) +
+                               scaled_data_size * sizeof(*scaled_data);
+  uint8_t* memory = (uint8_t*)calloc(memory_size, sizeof(*memory));
+  if (memory == NULL) {
+    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+    return 0;
+  }
+  assert(dec->rescaler_memory == NULL);
+  dec->rescaler_memory = memory;
+
+  dec->rescaler = (WebPRescaler*)memory;
+  memory += sizeof(*dec->rescaler);
+  work = (int32_t*)memory;
+  memory += work_size * sizeof(*work);
+  scaled_data = (uint32_t*)memory;
+
+  WebPRescalerInit(dec->rescaler, in_width, in_height, (uint8_t*)scaled_data,
+                   out_width, out_height, 0, num_channels,
+                   in_width, out_width, in_height, out_height, work);
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Export to ARGB
+
+// We have special "export" function since we need to convert from BGRA
+static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
+                  int rgba_stride, uint8_t* const rgba) {
+  uint32_t* const src = (uint32_t*)rescaler->dst;
+  const int dst_width = rescaler->dst_width;
+  int num_lines_out = 0;
+  while (WebPRescalerHasPendingOutput(rescaler)) {
+    uint8_t* const dst = rgba + num_lines_out * rgba_stride;
+    WebPRescalerExportRow(rescaler, 0);
+    WebPMultARGBRow(src, dst_width, 1);
+    VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
+    ++num_lines_out;
+  }
+  return num_lines_out;
+}
+
+// Emit scaled rows.
+static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
+                                uint8_t* in, int in_stride, int mb_h,
+                                uint8_t* const out, int out_stride) {
+  const WEBP_CSP_MODE colorspace = dec->output_->colorspace;
+  int num_lines_in = 0;
+  int num_lines_out = 0;
+  while (num_lines_in < mb_h) {
+    uint8_t* const row_in = in + num_lines_in * in_stride;
+    uint8_t* const row_out = out + num_lines_out * out_stride;
+    const int lines_left = mb_h - num_lines_in;
+    const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
+    assert(needed_lines > 0 && needed_lines <= lines_left);
+    WebPMultARGBRows(row_in, in_stride,
+                     dec->rescaler->src_width, needed_lines, 0);
+    WebPRescalerImport(dec->rescaler, lines_left, row_in, in_stride);
+    num_lines_in += needed_lines;
+    num_lines_out += Export(dec->rescaler, colorspace, out_stride, row_out);
+  }
+  return num_lines_out;
+}
+
+// Emit rows without any scaling.
+static int EmitRows(WEBP_CSP_MODE colorspace,
+                    const uint8_t* row_in, int in_stride,
+                    int mb_w, int mb_h,
+                    uint8_t* const out, int out_stride) {
+  int lines = mb_h;
+  uint8_t* row_out = out;
+  while (lines-- > 0) {
+    VP8LConvertFromBGRA((const uint32_t*)row_in, mb_w, colorspace, row_out);
+    row_in += in_stride;
+    row_out += out_stride;
+  }
+  return mb_h;  // Num rows out == num rows in.
+}
+
+//------------------------------------------------------------------------------
+// Export to YUVA
+
+// TODO(skal): should be in yuv.c
+static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
+                          const WebPDecBuffer* const output) {
+  const WebPYUVABuffer* const buf = &output->u.YUVA;
+  // first, the luma plane
+  {
+    int i;
+    uint8_t* const y = buf->y + y_pos * buf->y_stride;
+    for (i = 0; i < width; ++i) {
+      const uint32_t p = src[i];
+      y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
+                       YUV_HALF);
+    }
+  }
+
+  // then U/V planes
+  {
+    uint8_t* const u = buf->u + (y_pos >> 1) * buf->u_stride;
+    uint8_t* const v = buf->v + (y_pos >> 1) * buf->v_stride;
+    const int uv_width = width >> 1;
+    int i;
+    for (i = 0; i < uv_width; ++i) {
+      const uint32_t v0 = src[2 * i + 0];
+      const uint32_t v1 = src[2 * i + 1];
+      // VP8RGBToU/V expects four accumulated pixels. Hence we need to
+      // scale r/g/b value by a factor 2. We just shift v0/v1 one bit less.
+      const int r = ((v0 >> 15) & 0x1fe) + ((v1 >> 15) & 0x1fe);
+      const int g = ((v0 >>  7) & 0x1fe) + ((v1 >>  7) & 0x1fe);
+      const int b = ((v0 <<  1) & 0x1fe) + ((v1 <<  1) & 0x1fe);
+      if (!(y_pos & 1)) {  // even lines: store values
+        u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
+        v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+      } else {             // odd lines: average with previous values
+        const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
+        const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
+        // Approximated average-of-four. But it's an acceptable diff.
+        u[i] = (u[i] + tmp_u + 1) >> 1;
+        v[i] = (v[i] + tmp_v + 1) >> 1;
+      }
+    }
+    if (width & 1) {       // last pixel
+      const uint32_t v0 = src[2 * i + 0];
+      const int r = (v0 >> 14) & 0x3fc;
+      const int g = (v0 >>  6) & 0x3fc;
+      const int b = (v0 <<  2) & 0x3fc;
+      if (!(y_pos & 1)) {  // even lines
+        u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
+        v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+      } else {             // odd lines (note: we could just skip this)
+        const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
+        const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
+        u[i] = (u[i] + tmp_u + 1) >> 1;
+        v[i] = (v[i] + tmp_v + 1) >> 1;
+      }
+    }
+  }
+  // Lastly, store alpha if needed.
+  if (buf->a != NULL) {
+    int i;
+    uint8_t* const a = buf->a + y_pos * buf->a_stride;
+    for (i = 0; i < width; ++i) a[i] = (src[i] >> 24);
+  }
+}
+
+static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
+  WebPRescaler* const rescaler = dec->rescaler;
+  uint32_t* const src = (uint32_t*)rescaler->dst;
+  const int dst_width = rescaler->dst_width;
+  int num_lines_out = 0;
+  while (WebPRescalerHasPendingOutput(rescaler)) {
+    WebPRescalerExportRow(rescaler, 0);
+    WebPMultARGBRow(src, dst_width, 1);
+    ConvertToYUVA(src, dst_width, y_pos, dec->output_);
+    ++y_pos;
+    ++num_lines_out;
+  }
+  return num_lines_out;
+}
+
+static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec,
+                                uint8_t* in, int in_stride, int mb_h) {
+  int num_lines_in = 0;
+  int y_pos = dec->last_out_row_;
+  while (num_lines_in < mb_h) {
+    const int lines_left = mb_h - num_lines_in;
+    const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
+    WebPMultARGBRows(in, in_stride, dec->rescaler->src_width, needed_lines, 0);
+    WebPRescalerImport(dec->rescaler, lines_left, in, in_stride);
+    num_lines_in += needed_lines;
+    in += needed_lines * in_stride;
+    y_pos += ExportYUVA(dec, y_pos);
+  }
+  return y_pos;
+}
+
+static int EmitRowsYUVA(const VP8LDecoder* const dec,
+                        const uint8_t* in, int in_stride,
+                        int mb_w, int num_rows) {
+  int y_pos = dec->last_out_row_;
+  while (num_rows-- > 0) {
+    ConvertToYUVA((const uint32_t*)in, mb_w, y_pos, dec->output_);
+    in += in_stride;
+    ++y_pos;
+  }
+  return y_pos;
+}
+
+//------------------------------------------------------------------------------
+// Cropping.
+
+// Sets io->mb_y, io->mb_h & io->mb_w according to start row, end row and
+// crop options. Also updates the input data pointer, so that it points to the
+// start of the cropped window. Note that pixels are in ARGB format even if
+// 'in_data' is uint8_t*.
+// Returns true if the crop window is not empty.
+static int SetCropWindow(VP8Io* const io, int y_start, int y_end,
+                         uint8_t** const in_data, int pixel_stride) {
+  assert(y_start < y_end);
+  assert(io->crop_left < io->crop_right);
+  if (y_end > io->crop_bottom) {
+    y_end = io->crop_bottom;  // make sure we don't overflow on last row.
+  }
+  if (y_start < io->crop_top) {
+    const int delta = io->crop_top - y_start;
+    y_start = io->crop_top;
+    *in_data += delta * pixel_stride;
+  }
+  if (y_start >= y_end) return 0;  // Crop window is empty.
+
+  *in_data += io->crop_left * sizeof(uint32_t);
+
+  io->mb_y = y_start - io->crop_top;
+  io->mb_w = io->crop_right - io->crop_left;
+  io->mb_h = y_end - y_start;
+  return 1;  // Non-empty crop window.
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int GetMetaIndex(
+    const uint32_t* const image, int xsize, int bits, int x, int y) {
+  if (bits == 0) return 0;
+  return image[xsize * (y >> bits) + (x >> bits)];
+}
+
+static WEBP_INLINE HTreeGroup* GetHtreeGroupForPos(VP8LMetadata* const hdr,
+                                                   int x, int y) {
+  const int meta_index = GetMetaIndex(hdr->huffman_image_, hdr->huffman_xsize_,
+                                      hdr->huffman_subsample_bits_, x, y);
+  assert(meta_index < hdr->num_htree_groups_);
+  return hdr->htree_groups_ + meta_index;
+}
+
+//------------------------------------------------------------------------------
+// Main loop, with custom row-processing function
+
+typedef void (*ProcessRowsFunc)(VP8LDecoder* const dec, int row);
+
+static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
+                                   const uint32_t* const rows) {
+  int n = dec->next_transform_;
+  const int cache_pixs = dec->width_ * num_rows;
+  const int start_row = dec->last_row_;
+  const int end_row = start_row + num_rows;
+  const uint32_t* rows_in = rows;
+  uint32_t* const rows_out = dec->argb_cache_;
+
+  // Inverse transforms.
+  // TODO: most transforms only need to operate on the cropped region only.
+  memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
+  while (n-- > 0) {
+    VP8LTransform* const transform = &dec->transforms_[n];
+    VP8LInverseTransform(transform, start_row, end_row, rows_in, rows_out);
+    rows_in = rows_out;
+  }
+}
+
+// Special method for paletted alpha data.
+static void ApplyInverseTransformsAlpha(VP8LDecoder* const dec, int num_rows,
+                                        const uint8_t* const rows) {
+  const int start_row = dec->last_row_;
+  const int end_row = start_row + num_rows;
+  const uint8_t* rows_in = rows;
+  uint8_t* rows_out = (uint8_t*)dec->io_->opaque + dec->io_->width * start_row;
+  VP8LTransform* const transform = &dec->transforms_[0];
+  assert(dec->next_transform_ == 1);
+  assert(transform->type_ == COLOR_INDEXING_TRANSFORM);
+  VP8LColorIndexInverseTransformAlpha(transform, start_row, end_row, rows_in,
+                                      rows_out);
+}
+
+// Processes (transforms, scales & color-converts) the rows decoded after the
+// last call.
+static void ProcessRows(VP8LDecoder* const dec, int row) {
+  const uint32_t* const rows = dec->pixels_ + dec->width_ * dec->last_row_;
+  const int num_rows = row - dec->last_row_;
+
+  if (num_rows <= 0) return;  // Nothing to be done.
+  ApplyInverseTransforms(dec, num_rows, rows);
+
+  // Emit output.
+  {
+    VP8Io* const io = dec->io_;
+    uint8_t* rows_data = (uint8_t*)dec->argb_cache_;
+    const int in_stride = io->width * sizeof(uint32_t);  // in unit of RGBA
+    if (!SetCropWindow(io, dec->last_row_, row, &rows_data, in_stride)) {
+      // Nothing to output (this time).
+    } else {
+      const WebPDecBuffer* const output = dec->output_;
+      if (output->colorspace < MODE_YUV) {  // convert to RGBA
+        const WebPRGBABuffer* const buf = &output->u.RGBA;
+        uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
+        const int num_rows_out = io->use_scaling ?
+            EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
+                                 rgba, buf->stride) :
+            EmitRows(output->colorspace, rows_data, in_stride,
+                     io->mb_w, io->mb_h, rgba, buf->stride);
+        // Update 'last_out_row_'.
+        dec->last_out_row_ += num_rows_out;
+      } else {                              // convert to YUVA
+        dec->last_out_row_ = io->use_scaling ?
+            EmitRescaledRowsYUVA(dec, rows_data, in_stride, io->mb_h) :
+            EmitRowsYUVA(dec, rows_data, in_stride, io->mb_w, io->mb_h);
+      }
+      assert(dec->last_out_row_ <= output->height);
+    }
+  }
+
+  // Update 'last_row_'.
+  dec->last_row_ = row;
+  assert(dec->last_row_ <= dec->height_);
+}
+
+// Row-processing for the special case when alpha data contains only one
+// transform (color indexing), and trivial non-green literals.
+static int Is8bOptimizable(const VP8LMetadata* const hdr) {
+  int i;
+  if (hdr->color_cache_size_ > 0) return 0;
+  // When the Huffman tree contains only one symbol, we can skip the
+  // call to ReadSymbol() for red/blue/alpha channels.
+  for (i = 0; i < hdr->num_htree_groups_; ++i) {
+    HuffmanCode** const htrees = hdr->htree_groups_[i].htrees;
+    if (htrees[RED][0].bits > 0) return 0;
+    if (htrees[BLUE][0].bits > 0) return 0;
+    if (htrees[ALPHA][0].bits > 0) return 0;
+  }
+  return 1;
+}
+
+static void ExtractPalettedAlphaRows(VP8LDecoder* const dec, int row) {
+  const int num_rows = row - dec->last_row_;
+  const uint8_t* const in =
+      (uint8_t*)dec->pixels_ + dec->width_ * dec->last_row_;
+  if (num_rows > 0) {
+    ApplyInverseTransformsAlpha(dec, num_rows, in);
+  }
+  dec->last_row_ = dec->last_out_row_ = row;
+}
+
+//------------------------------------------------------------------------------
+// Helper functions for fast pattern copy (8b and 32b)
+
+// cyclic rotation of pattern word
+static WEBP_INLINE uint32_t Rotate8b(uint32_t V) {
+#if defined(WORDS_BIGENDIAN)
+  return ((V & 0xff000000u) >> 24) | (V << 8);
+#else
+  return ((V & 0xffu) << 24) | (V >> 8);
+#endif
+}
+
+// copy 1, 2 or 4-bytes pattern
+static WEBP_INLINE void CopySmallPattern8b(const uint8_t* src, uint8_t* dst,
+                                           int length, uint32_t pattern) {
+  int i;
+  // align 'dst' to 4-bytes boundary. Adjust the pattern along the way.
+  while ((uintptr_t)dst & 3) {
+    *dst++ = *src++;
+    pattern = Rotate8b(pattern);
+    --length;
+  }
+  // Copy the pattern 4 bytes at a time.
+  for (i = 0; i < (length >> 2); ++i) {
+    ((uint32_t*)dst)[i] = pattern;
+  }
+  // Finish with left-overs. 'pattern' is still correctly positioned,
+  // so no Rotate8b() call is needed.
+  for (i <<= 2; i < length; ++i) {
+    dst[i] = src[i];
+  }
+}
+
+static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) {
+  const uint8_t* src = dst - dist;
+  if (length >= 8) {
+    uint32_t pattern = 0;
+    switch (dist) {
+      case 1:
+        pattern = src[0];
+#if defined(__arm__) || defined(_M_ARM)   // arm doesn't like multiply that much
+        pattern |= pattern << 8;
+        pattern |= pattern << 16;
+#elif defined(WEBP_USE_MIPS_DSP_R2)
+        __asm__ volatile ("replv.qb %0, %0" : "+r"(pattern));
+#else
+        pattern = 0x01010101u * pattern;
+#endif
+        break;
+      case 2:
+        memcpy(&pattern, src, sizeof(uint16_t));
+#if defined(__arm__) || defined(_M_ARM)
+        pattern |= pattern << 16;
+#elif defined(WEBP_USE_MIPS_DSP_R2)
+        __asm__ volatile ("replv.ph %0, %0" : "+r"(pattern));
+#else
+        pattern = 0x00010001u * pattern;
+#endif
+        break;
+      case 4:
+        memcpy(&pattern, src, sizeof(uint32_t));
+        break;
+      default:
+        goto Copy;
+        break;
+    }
+    CopySmallPattern8b(src, dst, length, pattern);
+    return;
+  }
+ Copy:
+  if (dist >= length) {  // no overlap -> use memcpy()
+    memcpy(dst, src, length * sizeof(*dst));
+  } else {
+    int i;
+    for (i = 0; i < length; ++i) dst[i] = src[i];
+  }
+}
+
+// copy pattern of 1 or 2 uint32_t's
+static WEBP_INLINE void CopySmallPattern32b(const uint32_t* src,
+                                            uint32_t* dst,
+                                            int length, uint64_t pattern) {
+  int i;
+  if ((uintptr_t)dst & 4) {           // Align 'dst' to 8-bytes boundary.
+    *dst++ = *src++;
+    pattern = (pattern >> 32) | (pattern << 32);
+    --length;
+  }
+  assert(0 == ((uintptr_t)dst & 7));
+  for (i = 0; i < (length >> 1); ++i) {
+    ((uint64_t*)dst)[i] = pattern;    // Copy the pattern 8 bytes at a time.
+  }
+  if (length & 1) {                   // Finish with left-over.
+    dst[i << 1] = src[i << 1];
+  }
+}
+
+static WEBP_INLINE void CopyBlock32b(uint32_t* const dst,
+                                     int dist, int length) {
+  const uint32_t* const src = dst - dist;
+  if (dist <= 2 && length >= 4 && ((uintptr_t)dst & 3) == 0) {
+    uint64_t pattern;
+    if (dist == 1) {
+      pattern = (uint64_t)src[0];
+      pattern |= pattern << 32;
+    } else {
+      memcpy(&pattern, src, sizeof(pattern));
+    }
+    CopySmallPattern32b(src, dst, length, pattern);
+  } else if (dist >= length) {  // no overlap
+    memcpy(dst, src, length * sizeof(*dst));
+  } else {
+    int i;
+    for (i = 0; i < length; ++i) dst[i] = src[i];
+  }
+}
+
+//------------------------------------------------------------------------------
+
+static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
+                           int width, int height, int last_row) {
+  int ok = 1;
+  int row = dec->last_pixel_ / width;
+  int col = dec->last_pixel_ % width;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LMetadata* const hdr = &dec->hdr_;
+  const HTreeGroup* htree_group = GetHtreeGroupForPos(hdr, col, row);
+  int pos = dec->last_pixel_;         // current position
+  const int end = width * height;     // End of data
+  const int last = width * last_row;  // Last pixel to decode
+  const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;
+  const int mask = hdr->huffman_mask_;
+  assert(htree_group != NULL);
+  assert(pos < end);
+  assert(last_row <= height);
+  assert(Is8bOptimizable(hdr));
+
+  while (!br->eos_ && pos < last) {
+    int code;
+    // Only update when changing tile.
+    if ((col & mask) == 0) {
+      htree_group = GetHtreeGroupForPos(hdr, col, row);
+    }
+    VP8LFillBitWindow(br);
+    code = ReadSymbol(htree_group->htrees[GREEN], br);
+    if (code < NUM_LITERAL_CODES) {  // Literal
+      data[pos] = code;
+      ++pos;
+      ++col;
+      if (col >= width) {
+        col = 0;
+        ++row;
+        if (row % NUM_ARGB_CACHE_ROWS == 0) {
+          ExtractPalettedAlphaRows(dec, row);
+        }
+      }
+    } else if (code < len_code_limit) {  // Backward reference
+      int dist_code, dist;
+      const int length_sym = code - NUM_LITERAL_CODES;
+      const int length = GetCopyLength(length_sym, br);
+      const int dist_symbol = ReadSymbol(htree_group->htrees[DIST], br);
+      VP8LFillBitWindow(br);
+      dist_code = GetCopyDistance(dist_symbol, br);
+      dist = PlaneCodeToDistance(width, dist_code);
+      if (pos >= dist && end - pos >= length) {
+        CopyBlock8b(data + pos, dist, length);
+      } else {
+        ok = 0;
+        goto End;
+      }
+      pos += length;
+      col += length;
+      while (col >= width) {
+        col -= width;
+        ++row;
+        if (row % NUM_ARGB_CACHE_ROWS == 0) {
+          ExtractPalettedAlphaRows(dec, row);
+        }
+      }
+      if (pos < last && (col & mask)) {
+        htree_group = GetHtreeGroupForPos(hdr, col, row);
+      }
+    } else {  // Not reached
+      ok = 0;
+      goto End;
+    }
+    assert(br->eos_ == VP8LIsEndOfStream(br));
+  }
+  // Process the remaining rows corresponding to last row-block.
+  ExtractPalettedAlphaRows(dec, row);
+
+ End:
+  if (!ok || (br->eos_ && pos < end)) {
+    ok = 0;
+    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
+                            : VP8_STATUS_BITSTREAM_ERROR;
+  } else {
+    dec->last_pixel_ = (int)pos;
+  }
+  return ok;
+}
+
+static void SaveState(VP8LDecoder* const dec, int last_pixel) {
+  assert(dec->incremental_);
+  dec->saved_br_ = dec->br_;
+  dec->saved_last_pixel_ = last_pixel;
+  if (dec->hdr_.color_cache_size_ > 0) {
+    VP8LColorCacheCopy(&dec->hdr_.color_cache_, &dec->hdr_.saved_color_cache_);
+  }
+}
+
+static void RestoreState(VP8LDecoder* const dec) {
+  assert(dec->br_.eos_);
+  dec->status_ = VP8_STATUS_SUSPENDED;
+  dec->br_ = dec->saved_br_;
+  dec->last_pixel_ = dec->saved_last_pixel_;
+  if (dec->hdr_.color_cache_size_ > 0) {
+    VP8LColorCacheCopy(&dec->hdr_.saved_color_cache_, &dec->hdr_.color_cache_);
+  }
+}
+
+#define SYNC_EVERY_N_ROWS 8  // minimum number of rows between check-points
+static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
+                           int width, int height, int last_row,
+                           ProcessRowsFunc process_func) {
+  int row = dec->last_pixel_ / width;
+  int col = dec->last_pixel_ % width;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LMetadata* const hdr = &dec->hdr_;
+  HTreeGroup* htree_group = GetHtreeGroupForPos(hdr, col, row);
+  uint32_t* src = data + dec->last_pixel_;
+  uint32_t* last_cached = src;
+  uint32_t* const src_end = data + width * height;     // End of data
+  uint32_t* const src_last = data + width * last_row;  // Last pixel to decode
+  const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;
+  const int color_cache_limit = len_code_limit + hdr->color_cache_size_;
+  int next_sync_row = dec->incremental_ ? row : 1 << 24;
+  VP8LColorCache* const color_cache =
+      (hdr->color_cache_size_ > 0) ? &hdr->color_cache_ : NULL;
+  const int mask = hdr->huffman_mask_;
+  assert(htree_group != NULL);
+  assert(src < src_end);
+  assert(src_last <= src_end);
+
+  while (src < src_last) {
+    int code;
+    if (row >= next_sync_row) {
+      SaveState(dec, (int)(src - data));
+      next_sync_row = row + SYNC_EVERY_N_ROWS;
+    }
+    // Only update when changing tile. Note we could use this test:
+    // if "((((prev_col ^ col) | prev_row ^ row)) > mask)" -> tile changed
+    // but that's actually slower and needs storing the previous col/row.
+    if ((col & mask) == 0) {
+      htree_group = GetHtreeGroupForPos(hdr, col, row);
+    }
+    VP8LFillBitWindow(br);
+    code = ReadSymbol(htree_group->htrees[GREEN], br);
+    if (br->eos_) break;  // early out
+    if (code < NUM_LITERAL_CODES) {  // Literal
+      if (htree_group->is_trivial_literal) {
+        *src = htree_group->literal_arb | (code << 8);
+      } else {
+        int red, blue, alpha;
+        red = ReadSymbol(htree_group->htrees[RED], br);
+        VP8LFillBitWindow(br);
+        blue = ReadSymbol(htree_group->htrees[BLUE], br);
+        alpha = ReadSymbol(htree_group->htrees[ALPHA], br);
+        if (br->eos_) break;
+        *src = ((uint32_t)alpha << 24) | (red << 16) | (code << 8) | blue;
+      }
+    AdvanceByOne:
+      ++src;
+      ++col;
+      if (col >= width) {
+        col = 0;
+        ++row;
+        if ((row % NUM_ARGB_CACHE_ROWS == 0) && (process_func != NULL)) {
+          process_func(dec, row);
+        }
+        if (color_cache != NULL) {
+          while (last_cached < src) {
+            VP8LColorCacheInsert(color_cache, *last_cached++);
+          }
+        }
+      }
+    } else if (code < len_code_limit) {  // Backward reference
+      int dist_code, dist;
+      const int length_sym = code - NUM_LITERAL_CODES;
+      const int length = GetCopyLength(length_sym, br);
+      const int dist_symbol = ReadSymbol(htree_group->htrees[DIST], br);
+      VP8LFillBitWindow(br);
+      dist_code = GetCopyDistance(dist_symbol, br);
+      dist = PlaneCodeToDistance(width, dist_code);
+      if (br->eos_) break;
+      if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
+        goto Error;
+      } else {
+        CopyBlock32b(src, dist, length);
+      }
+      src += length;
+      col += length;
+      while (col >= width) {
+        col -= width;
+        ++row;
+        if ((row % NUM_ARGB_CACHE_ROWS == 0) && (process_func != NULL)) {
+          process_func(dec, row);
+        }
+      }
+      // Because of the check done above (before 'src' was incremented by
+      // 'length'), the following holds true.
+      assert(src <= src_end);
+      if (col & mask) htree_group = GetHtreeGroupForPos(hdr, col, row);
+      if (color_cache != NULL) {
+        while (last_cached < src) {
+          VP8LColorCacheInsert(color_cache, *last_cached++);
+        }
+      }
+    } else if (code < color_cache_limit) {  // Color cache
+      const int key = code - len_code_limit;
+      assert(color_cache != NULL);
+      while (last_cached < src) {
+        VP8LColorCacheInsert(color_cache, *last_cached++);
+      }
+      *src = VP8LColorCacheLookup(color_cache, key);
+      goto AdvanceByOne;
+    } else {  // Not reached
+      goto Error;
+    }
+    assert(br->eos_ == VP8LIsEndOfStream(br));
+  }
+
+  if (dec->incremental_ && br->eos_ && src < src_end) {
+    RestoreState(dec);
+  } else if (!br->eos_) {
+    // Process the remaining rows corresponding to last row-block.
+    if (process_func != NULL) {
+      process_func(dec, row);
+    }
+    dec->status_ = VP8_STATUS_OK;
+    dec->last_pixel_ = (int)(src - data);  // end-of-scan marker
+  } else {
+    // if not incremental, and we are past the end of buffer (eos_=1), then this
+    // is a real bitstream error.
+    goto Error;
+  }
+  return 1;
+
+ Error:
+  dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+  return 0;
+}
+
+// -----------------------------------------------------------------------------
+// VP8LTransform
+
+static void ClearTransform(VP8LTransform* const transform) {
+  free(transform->data_);
+  transform->data_ = NULL;
+}
+
+// For security reason, we need to remap the color map to span
+// the total possible bundled values, and not just the num_colors.
+static int ExpandColorMap(int num_colors, VP8LTransform* const transform) {
+  int i;
+  const int final_num_colors = 1 << (8 >> transform->bits_);
+  uint32_t* const new_color_map = (uint32_t*)malloc((uint64_t)final_num_colors * sizeof(*new_color_map));
+  if (new_color_map == NULL) {
+    return 0;
+  } else {
+    uint8_t* const data = (uint8_t*)transform->data_;
+    uint8_t* const new_data = (uint8_t*)new_color_map;
+    new_color_map[0] = transform->data_[0];
+    for (i = 4; i < 4 * num_colors; ++i) {
+      // Equivalent to AddPixelEq(), on a byte-basis.
+      new_data[i] = (data[i] + new_data[i - 4]) & 0xff;
+    }
+    for (; i < 4 * final_num_colors; ++i)
+      new_data[i] = 0;  // black tail.
+    free(transform->data_);
+    transform->data_ = new_color_map;
+  }
+  return 1;
+}
+
+static int ReadTransform(int* const xsize, int const* ysize,
+                         VP8LDecoder* const dec) {
+  int ok = 1;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LTransform* transform = &dec->transforms_[dec->next_transform_];
+  const VP8LImageTransformType type =
+      (VP8LImageTransformType)VP8LReadBits(br, 2);
+
+  // Each transform type can only be present once in the stream.
+  if (dec->transforms_seen_ & (1U << type)) {
+    return 0;  // Already there, let's not accept the second same transform.
+  }
+  dec->transforms_seen_ |= (1U << type);
+
+  transform->type_ = type;
+  transform->xsize_ = *xsize;
+  transform->ysize_ = *ysize;
+  transform->data_ = NULL;
+  ++dec->next_transform_;
+  assert(dec->next_transform_ <= NUM_TRANSFORMS);
+
+  switch (type) {
+    case PREDICTOR_TRANSFORM:
+    case CROSS_COLOR_TRANSFORM:
+      transform->bits_ = VP8LReadBits(br, 3) + 2;
+      ok = DecodeImageStream(VP8LSubSampleSize(transform->xsize_,
+                                               transform->bits_),
+                             VP8LSubSampleSize(transform->ysize_,
+                                               transform->bits_),
+                             0, dec, &transform->data_);
+      break;
+    case COLOR_INDEXING_TRANSFORM: {
+       const int num_colors = VP8LReadBits(br, 8) + 1;
+       const int bits = (num_colors > 16) ? 0
+                      : (num_colors > 4) ? 1
+                      : (num_colors > 2) ? 2
+                      : 3;
+       *xsize = VP8LSubSampleSize(transform->xsize_, bits);
+       transform->bits_ = bits;
+       ok = DecodeImageStream(num_colors, 1, 0, dec, &transform->data_);
+       ok = ok && ExpandColorMap(num_colors, transform);
+      break;
+    }
+    case SUBTRACT_GREEN:
+      break;
+    default:
+      assert(0);    // can't happen
+      break;
+  }
+
+  return ok;
+}
+
+// -----------------------------------------------------------------------------
+// VP8LMetadata
+
+static void InitMetadata(VP8LMetadata* const hdr) {
+  assert(hdr != NULL);
+  memset(hdr, 0, sizeof(*hdr));
+}
+
+static void ClearMetadata(VP8LMetadata* const hdr) {
+  assert(hdr != NULL);
+
+  free(hdr->huffman_image_);
+  free(hdr->huffman_tables_);
+  VP8LHtreeGroupsFree(hdr->htree_groups_);
+  VP8LColorCacheClear(&hdr->color_cache_);
+  VP8LColorCacheClear(&hdr->saved_color_cache_);
+  InitMetadata(hdr);
+}
+
+// -----------------------------------------------------------------------------
+// VP8LDecoder
+
+VP8LDecoder* VP8LNew(void) {
+  VP8LDecoder* const dec = (VP8LDecoder*)calloc(1ULL, sizeof(*dec));
+  if (dec == NULL) return NULL;
+  dec->status_ = VP8_STATUS_OK;
+  dec->state_ = READ_DIM;
+
+  VP8LDspInit();  // Init critical function pointers.
+
+  return dec;
+}
+
+void VP8LClear(VP8LDecoder* const dec) {
+  int i;
+  if (dec == NULL) return;
+  ClearMetadata(&dec->hdr_);
+
+  free(dec->pixels_);
+  dec->pixels_ = NULL;
+  for (i = 0; i < dec->next_transform_; ++i) {
+    ClearTransform(&dec->transforms_[i]);
+  }
+  dec->next_transform_ = 0;
+  dec->transforms_seen_ = 0;
+
+  free(dec->rescaler_memory);
+  dec->rescaler_memory = NULL;
+
+  dec->output_ = NULL;   // leave no trace behind
+}
+
+void VP8LDelete(VP8LDecoder* const dec) {
+  if (dec != NULL) {
+    VP8LClear(dec);
+    free(dec);
+  }
+}
+
+static void UpdateDecoder(VP8LDecoder* const dec, int width, int height) {
+  VP8LMetadata* const hdr = &dec->hdr_;
+  const int num_bits = hdr->huffman_subsample_bits_;
+  dec->width_ = width;
+  dec->height_ = height;
+
+  hdr->huffman_xsize_ = VP8LSubSampleSize(width, num_bits);
+  hdr->huffman_mask_ = (num_bits == 0) ? ~0 : (1 << num_bits) - 1;
+}
+
+static int DecodeImageStream(int xsize, int ysize,
+                             int is_level0,
+                             VP8LDecoder* const dec,
+                             uint32_t** const decoded_data) {
+  int ok = 1;
+  int transform_xsize = xsize;
+  int transform_ysize = ysize;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LMetadata* const hdr = &dec->hdr_;
+  uint32_t* data = NULL;
+  int color_cache_bits = 0;
+
+  // Read the transforms (may recurse).
+  if (is_level0) {
+    while (ok && VP8LReadBits(br, 1)) {
+      ok = ReadTransform(&transform_xsize, &transform_ysize, dec);
+    }
+  }
+
+  // Color cache
+  if (ok && VP8LReadBits(br, 1)) {
+    color_cache_bits = VP8LReadBits(br, 4);
+    ok = (color_cache_bits >= 1 && color_cache_bits <= MAX_CACHE_BITS);
+    if (!ok) {
+      dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+      goto End;
+    }
+  }
+
+  // Read the Huffman codes (may recurse).
+  ok = ok && ReadHuffmanCodes(dec, transform_xsize, transform_ysize,
+                              color_cache_bits, is_level0);
+  if (!ok) {
+    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    goto End;
+  }
+
+  // Finish setting up the color-cache
+  if (color_cache_bits > 0) {
+    hdr->color_cache_size_ = 1 << color_cache_bits;
+    if (!VP8LColorCacheInit(&hdr->color_cache_, color_cache_bits)) {
+      dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+      ok = 0;
+      goto End;
+    }
+  } else {
+    hdr->color_cache_size_ = 0;
+  }
+  UpdateDecoder(dec, transform_xsize, transform_ysize);
+
+  if (is_level0) {   // level 0 complete
+    dec->state_ = READ_HDR;
+    goto End;
+  }
+
+  {
+    const uint64_t total_size = (uint64_t)transform_xsize * transform_ysize;
+    data = (uint32_t*)malloc(total_size * sizeof(*data));
+    if (data == NULL) {
+      dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+      ok = 0;
+      goto End;
+    }
+  }
+
+  // Use the Huffman trees to decode the LZ77 encoded data.
+  ok = DecodeImageData(dec, data, transform_xsize, transform_ysize,
+                       transform_ysize, NULL);
+  ok = ok && !br->eos_;
+
+ End:
+  if (!ok) {
+    free(data);
+    ClearMetadata(hdr);
+  } else {
+    if (decoded_data != NULL) {
+      *decoded_data = data;
+    } else {
+      // We allocate image data in this function only for transforms. At level 0
+      // (that is: not the transforms), we shouldn't have allocated anything.
+      assert(data == NULL);
+      assert(is_level0);
+    }
+    dec->last_pixel_ = 0;  // Reset for future DECODE_DATA_FUNC() calls.
+    if (!is_level0) ClearMetadata(hdr);  // Clean up temporary data behind.
+  }
+  return ok;
+}
+
+//------------------------------------------------------------------------------
+// Allocate internal buffers dec->pixels_ and dec->argb_cache_.
+static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
+  const uint64_t num_pixels = (uint64_t)dec->width_ * dec->height_;
+  // Scratch buffer corresponding to top-prediction row for transforming the
+  // first row in the row-blocks. Not needed for paletted alpha.
+  const uint64_t cache_top_pixels = (uint16_t)final_width;
+  // Scratch buffer for temporary BGRA storage. Not needed for paletted alpha.
+  const uint64_t cache_pixels = (uint64_t)final_width * NUM_ARGB_CACHE_ROWS;
+  const uint64_t total_num_pixels =
+      num_pixels + cache_top_pixels + cache_pixels;
+
+  assert(dec->width_ <= final_width);
+  dec->pixels_ = (uint32_t*)malloc(total_num_pixels * sizeof(uint32_t));
+  if (dec->pixels_ == NULL) {
+    dec->argb_cache_ = NULL;    // for sanity check
+    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+    return 0;
+  }
+  dec->argb_cache_ = dec->pixels_ + num_pixels + cache_top_pixels;
+  return 1;
+}
+
+static int AllocateInternalBuffers8b(VP8LDecoder* const dec) {
+  const uint64_t total_num_pixels = (uint64_t)dec->width_ * dec->height_;
+  dec->argb_cache_ = NULL;    // for sanity check
+  dec->pixels_ = (uint32_t*)malloc(total_num_pixels * sizeof(uint8_t));
+  if (dec->pixels_ == NULL) {
+    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+    return 0;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+// Special row-processing that only stores the alpha data.
+static void ExtractAlphaRows(VP8LDecoder* const dec, int row) {
+  const int num_rows = row - dec->last_row_;
+  const uint32_t* const in = dec->pixels_ + dec->width_ * dec->last_row_;
+
+  if (num_rows <= 0) return;  // Nothing to be done.
+  ApplyInverseTransforms(dec, num_rows, in);
+
+  // Extract alpha (which is stored in the green plane).
+  {
+    const int width = dec->io_->width;      // the final width (!= dec->width_)
+    const int cache_pixs = width * num_rows;
+    uint8_t* const dst = (uint8_t*)dec->io_->opaque + width * dec->last_row_;
+    const uint32_t* const src = dec->argb_cache_;
+    int i;
+    for (i = 0; i < cache_pixs; ++i) dst[i] = (src[i] >> 8) & 0xff;
+  }
+  dec->last_row_ = dec->last_out_row_ = row;
+}
+
+int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec,
+                          const uint8_t* const data, size_t data_size,
+                          uint8_t* const output) {
+  int ok = 0;
+  VP8LDecoder* dec;
+  VP8Io* io;
+  assert(alph_dec != NULL);
+  alph_dec->vp8l_dec_ = VP8LNew();
+  if (alph_dec->vp8l_dec_ == NULL) return 0;
+  dec = alph_dec->vp8l_dec_;
+
+  dec->width_ = alph_dec->width_;
+  dec->height_ = alph_dec->height_;
+  dec->io_ = &alph_dec->io_;
+  io = dec->io_;
+
+  VP8InitIo(io);
+  WebPInitCustomIo(NULL, io);  // Just a sanity Init. io won't be used.
+  io->opaque = output;
+  io->width = alph_dec->width_;
+  io->height = alph_dec->height_;
+
+  dec->status_ = VP8_STATUS_OK;
+  VP8LInitBitReader(&dec->br_, data, data_size);
+
+  if (!DecodeImageStream(alph_dec->width_, alph_dec->height_, 1, dec, NULL)) {
+    goto Err;
+  }
+
+  // Special case: if alpha data uses only the color indexing transform and
+  // doesn't use color cache (a frequent case), we will use DecodeAlphaData()
+  // method that only needs allocation of 1 byte per pixel (alpha channel).
+  if (dec->next_transform_ == 1 &&
+      dec->transforms_[0].type_ == COLOR_INDEXING_TRANSFORM &&
+      Is8bOptimizable(&dec->hdr_)) {
+    alph_dec->use_8b_decode = 1;
+    ok = AllocateInternalBuffers8b(dec);
+  } else {
+    // Allocate internal buffers (note that dec->width_ may have changed here).
+    alph_dec->use_8b_decode = 0;
+    ok = AllocateInternalBuffers32b(dec, alph_dec->width_);
+  }
+
+  if (!ok) goto Err;
+
+  return 1;
+
+ Err:
+  VP8LDelete(alph_dec->vp8l_dec_);
+  alph_dec->vp8l_dec_ = NULL;
+  return 0;
+}
+
+int VP8LDecodeAlphaImageStream(ALPHDecoder* const alph_dec, int last_row) {
+  VP8LDecoder* const dec = alph_dec->vp8l_dec_;
+  assert(dec != NULL);
+  assert(last_row <= dec->height_);
+
+  if (dec->last_pixel_ == dec->width_ * dec->height_) {
+    return 1;  // done
+  }
+
+  // Decode (with special row processing).
+  return alph_dec->use_8b_decode ?
+      DecodeAlphaData(dec, (uint8_t*)dec->pixels_, dec->width_, dec->height_,
+                      last_row) :
+      DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_,
+                      last_row, ExtractAlphaRows);
+}
+
+//------------------------------------------------------------------------------
+
+int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io) {
+  int width, height, has_alpha;
+
+  if (dec == NULL) return 0;
+  if (io == NULL) {
+    dec->status_ = VP8_STATUS_INVALID_PARAM;
+    return 0;
+  }
+
+  dec->io_ = io;
+  dec->status_ = VP8_STATUS_OK;
+  VP8LInitBitReader(&dec->br_, io->data, io->data_size);
+  if (!ReadImageInfo(&dec->br_, &width, &height, &has_alpha)) {
+    dec->status_ = VP8_STATUS_BITSTREAM_ERROR;
+    goto Error;
+  }
+  dec->state_ = READ_DIM;
+  io->width = width;
+  io->height = height;
+
+  if (!DecodeImageStream(width, height, 1, dec, NULL)) goto Error;
+  return 1;
+
+ Error:
+  VP8LClear(dec);
+  assert(dec->status_ != VP8_STATUS_OK);
+  return 0;
+}
+
+int VP8LDecodeImage(VP8LDecoder* const dec) {
+  VP8Io* io = NULL;
+  WebPDecParams* params = NULL;
+
+  // Sanity checks.
+  if (dec == NULL) return 0;
+
+  assert(dec->hdr_.huffman_tables_ != NULL);
+  assert(dec->hdr_.htree_groups_ != NULL);
+  assert(dec->hdr_.num_htree_groups_ > 0);
+
+  io = dec->io_;
+  assert(io != NULL);
+  params = (WebPDecParams*)io->opaque;
+  assert(params != NULL);
+
+  // Initialization.
+  if (dec->state_ != READ_DATA) {
+    dec->output_ = params->output;
+    assert(dec->output_ != NULL);
+
+    if (!WebPIoInitFromOptions(params->options, io, MODE_BGRA)) {
+      dec->status_ = VP8_STATUS_INVALID_PARAM;
+      goto Err;
+    }
+
+    if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;
+
+    if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
+
+    if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
+      // need the alpha-multiply functions for premultiplied output or rescaling
+      WebPInitAlphaProcessing();
+    }
+    if (dec->incremental_) {
+      if (dec->hdr_.color_cache_size_ > 0 &&
+          dec->hdr_.saved_color_cache_.colors_ == NULL) {
+        if (!VP8LColorCacheInit(&dec->hdr_.saved_color_cache_,
+                                dec->hdr_.color_cache_.hash_bits_)) {
+          dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+          goto Err;
+        }
+      }
+    }
+    dec->state_ = READ_DATA;
+  }
+
+  // Decode.
+  if (!DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_,
+                       dec->height_, ProcessRows)) {
+    goto Err;
+  }
+
+  params->last_y = dec->last_out_row_;
+  return 1;
+
+ Err:
+  VP8LClear(dec);
+  assert(dec->status_ != VP8_STATUS_OK);
+  return 0;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/loaders/webp/dec/vp8li.h b/src/loaders/webp/dec/vp8li.h
new file mode 100644
index 00000000..8886e47f
--- /dev/null
+++ b/src/loaders/webp/dec/vp8li.h
@@ -0,0 +1,136 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Lossless decoder: internal header.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+//         Vikas Arora(vikaas.arora@gmail.com)
+
+#ifndef WEBP_DEC_VP8LI_H_
+#define WEBP_DEC_VP8LI_H_
+
+#include <string.h>     // for memcpy()
+#include "./webpi.h"
+#include "../utils/bit_reader.h"
+#include "../utils/color_cache.h"
+#include "../utils/huffman.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  READ_DATA = 0,
+  READ_HDR = 1,
+  READ_DIM = 2
+} VP8LDecodeState;
+
+typedef struct VP8LTransform VP8LTransform;
+struct VP8LTransform {
+  VP8LImageTransformType type_;   // transform type.
+  int                    bits_;   // subsampling bits defining transform window.
+  int                    xsize_;  // transform window X index.
+  int                    ysize_;  // transform window Y index.
+  uint32_t              *data_;   // transform data.
+};
+
+typedef struct {
+  int             color_cache_size_;
+  VP8LColorCache  color_cache_;
+  VP8LColorCache  saved_color_cache_;  // for incremental
+
+  int             huffman_mask_;
+  int             huffman_subsample_bits_;
+  int             huffman_xsize_;
+  uint32_t       *huffman_image_;
+  int             num_htree_groups_;
+  HTreeGroup     *htree_groups_;
+  HuffmanCode    *huffman_tables_;
+} VP8LMetadata;
+
+typedef struct VP8LDecoder VP8LDecoder;
+struct VP8LDecoder {
+  VP8StatusCode    status_;
+  VP8LDecodeState  state_;
+  VP8Io           *io_;
+
+  const WebPDecBuffer *output_;    // shortcut to io->opaque->output
+
+  uint32_t        *pixels_;        // Internal data: either uint8_t* for alpha
+                                   // or uint32_t* for BGRA.
+  uint32_t        *argb_cache_;    // Scratch buffer for temporary BGRA storage.
+
+  VP8LBitReader    br_;
+  int              incremental_;   // if true, incremental decoding is expected
+  VP8LBitReader    saved_br_;      // note: could be local variables too
+  int              saved_last_pixel_;
+
+  int              width_;
+  int              height_;
+  int              last_row_;      // last input row decoded so far.
+  int              last_pixel_;    // last pixel decoded so far. However, it may
+                                   // not be transformed, scaled and
+                                   // color-converted yet.
+  int              last_out_row_;  // last row output so far.
+
+  VP8LMetadata     hdr_;
+
+  int              next_transform_;
+  VP8LTransform    transforms_[NUM_TRANSFORMS];
+  // or'd bitset storing the transforms types.
+  uint32_t         transforms_seen_;
+
+  uint8_t         *rescaler_memory;  // Working memory for rescaling work.
+  WebPRescaler    *rescaler;         // Common rescaler for all channels.
+};
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+struct ALPHDecoder;  // Defined in dec/alphai.h.
+
+// in vp8l.c
+
+// Decodes image header for alpha data stored using lossless compression.
+// Returns false in case of error.
+int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
+                          const uint8_t* const data, size_t data_size,
+                          uint8_t* const output);
+
+// Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
+// already decoded in previous call(s), it will resume decoding from where it
+// was paused.
+// Returns false in case of bitstream error.
+int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
+                               int last_row);
+
+// Allocates and initialize a new lossless decoder instance.
+VP8LDecoder* VP8LNew(void);
+
+// Decodes the image header. Returns false in case of error.
+int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
+
+// Decodes an image. It's required to decode the lossless header before calling
+// this function. Returns false in case of error, with updated dec->status_.
+int VP8LDecodeImage(VP8LDecoder* const dec);
+
+// Resets the decoder in its initial state, reclaiming memory.
+// Preserves the dec->status_ value.
+void VP8LClear(VP8LDecoder* const dec);
+
+// Clears and deallocate a lossless decoder instance.
+void VP8LDelete(VP8LDecoder* const dec);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_VP8LI_H_ */
diff --git a/src/loaders/webp/dec/webp.cpp b/src/loaders/webp/dec/webp.cpp
new file mode 100644
index 00000000..7f6a5a44
--- /dev/null
+++ b/src/loaders/webp/dec/webp.cpp
@@ -0,0 +1,674 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Main decoding functions for WEBP images.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <stdlib.h>
+
+#include "./vp8i.h"
+#include "./vp8li.h"
+#include "./webpi.h"
+
+
+// VP8X Feature Flags.
+typedef enum WebPFeatureFlags {
+  FRAGMENTS_FLAG  = 0x00000001,
+  ANIMATION_FLAG  = 0x00000002,
+  XMP_FLAG        = 0x00000004,
+  EXIF_FLAG       = 0x00000008,
+  ALPHA_FLAG      = 0x00000010,
+  ICCP_FLAG       = 0x00000020
+} WebPFeatureFlags;
+
+
+//------------------------------------------------------------------------------
+// RIFF layout is:
+//   Offset  tag
+//   0...3   "RIFF" 4-byte tag
+//   4...7   size of image data (including metadata) starting at offset 8
+//   8...11  "WEBP"   our form-type signature
+// The RIFF container (12 bytes) is followed by appropriate chunks:
+//   12..15  "VP8 ": 4-bytes tags, signaling the use of VP8 video format
+//   16..19  size of the raw VP8 image data, starting at offset 20
+//   20....  the VP8 bytes
+// Or,
+//   12..15  "VP8L": 4-bytes tags, signaling the use of VP8L lossless format
+//   16..19  size of the raw VP8L image data, starting at offset 20
+//   20....  the VP8L bytes
+// Or,
+//   12..15  "VP8X": 4-bytes tags, describing the extended-VP8 chunk.
+//   16..19  size of the VP8X chunk starting at offset 20.
+//   20..23  VP8X flags bit-map corresponding to the chunk-types present.
+//   24..26  Width of the Canvas Image.
+//   27..29  Height of the Canvas Image.
+// There can be extra chunks after the "VP8X" chunk (ICCP, FRGM, ANMF, VP8,
+// VP8L, XMP, EXIF  ...)
+// All sizes are in little-endian order.
+// Note: chunk data size must be padded to multiple of 2 when written.
+
+static WEBP_INLINE uint32_t get_le24(const uint8_t* const data) {
+  return data[0] | (data[1] << 8) | (data[2] << 16);
+}
+
+static WEBP_INLINE uint32_t get_le32(const uint8_t* const data) {
+  return (uint32_t)get_le24(data) | (data[3] << 24);
+}
+
+// Validates the RIFF container (if detected) and skips over it.
+// If a RIFF container is detected, returns:
+//     VP8_STATUS_BITSTREAM_ERROR for invalid header,
+//     VP8_STATUS_NOT_ENOUGH_DATA for truncated data if have_all_data is true,
+// and VP8_STATUS_OK otherwise.
+// In case there are not enough bytes (partial RIFF container), return 0 for
+// *riff_size. Else return the RIFF size extracted from the header.
+static VP8StatusCode ParseRIFF(const uint8_t** const data,
+                               size_t* const data_size, int have_all_data,
+                               size_t* const riff_size) {
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(riff_size != NULL);
+
+  *riff_size = 0;  // Default: no RIFF present.
+  if (*data_size >= RIFF_HEADER_SIZE && !memcmp(*data, "RIFF", TAG_SIZE)) {
+    if (memcmp(*data + 8, "WEBP", TAG_SIZE)) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong image file signature.
+    } else {
+      const uint32_t size = get_le32(*data + TAG_SIZE);
+      // Check that we have at least one chunk (i.e "WEBP" + "VP8?nnnn").
+      if (size < TAG_SIZE + CHUNK_HEADER_SIZE) {
+        return VP8_STATUS_BITSTREAM_ERROR;
+      }
+      if (size > MAX_CHUNK_PAYLOAD) {
+        return VP8_STATUS_BITSTREAM_ERROR;
+      }
+      if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) {
+        return VP8_STATUS_NOT_ENOUGH_DATA;  // Truncated bitstream.
+      }
+      // We have a RIFF container. Skip it.
+      *riff_size = size;
+      *data += RIFF_HEADER_SIZE;
+      *data_size -= RIFF_HEADER_SIZE;
+    }
+  }
+  return VP8_STATUS_OK;
+}
+
+// Validates the VP8X header and skips over it.
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid VP8X header,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8X chunk is found, found_vp8x is set to true and *width_ptr,
+// *height_ptr and *flags_ptr are set to the corresponding values extracted
+// from the VP8X chunk.
+static VP8StatusCode ParseVP8X(const uint8_t** const data,
+                               size_t* const data_size,
+                               int* const found_vp8x,
+                               int* const width_ptr, int* const height_ptr,
+                               uint32_t* const flags_ptr) {
+  const uint32_t vp8x_size = CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(found_vp8x != NULL);
+
+  *found_vp8x = 0;
+
+  if (*data_size < CHUNK_HEADER_SIZE) {
+    return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
+  }
+
+  if (!memcmp(*data, "VP8X", TAG_SIZE)) {
+    int width, height;
+    uint32_t flags;
+    const uint32_t chunk_size = get_le32(*data + TAG_SIZE);
+    if (chunk_size != VP8X_CHUNK_SIZE) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Wrong chunk size.
+    }
+
+    // Verify if enough data is available to validate the VP8X chunk.
+    if (*data_size < vp8x_size) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
+    }
+    flags = get_le32(*data + 8);
+    width = 1 + get_le24(*data + 12);
+    height = 1 + get_le24(*data + 15);
+    if (width * (uint64_t)height >= MAX_IMAGE_AREA) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // image is too large
+    }
+
+    if (flags_ptr != NULL) *flags_ptr = flags;
+    if (width_ptr != NULL) *width_ptr = width;
+    if (height_ptr != NULL) *height_ptr = height;
+    // Skip over VP8X header bytes.
+    *data += vp8x_size;
+    *data_size -= vp8x_size;
+    *found_vp8x = 1;
+  }
+  return VP8_STATUS_OK;
+}
+
+// Skips to the next VP8/VP8L chunk header in the data given the size of the
+// RIFF chunk 'riff_size'.
+// Returns VP8_STATUS_BITSTREAM_ERROR if any invalid chunk size is encountered,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If an alpha chunk is found, *alpha_data and *alpha_size are set
+// appropriately.
+static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
+                                         size_t* const data_size,
+                                         size_t const riff_size,
+                                         const uint8_t** const alpha_data,
+                                         size_t* const alpha_size) {
+  const uint8_t* buf;
+  size_t buf_size;
+  uint32_t total_size = TAG_SIZE +           // "WEBP".
+                        CHUNK_HEADER_SIZE +  // "VP8Xnnnn".
+                        VP8X_CHUNK_SIZE;     // data.
+  assert(data != NULL);
+  assert(data_size != NULL);
+  buf = *data;
+  buf_size = *data_size;
+
+  assert(alpha_data != NULL);
+  assert(alpha_size != NULL);
+  *alpha_data = NULL;
+  *alpha_size = 0;
+
+  while (1) {
+    uint32_t chunk_size;
+    uint32_t disk_chunk_size;   // chunk_size with padding
+
+    *data = buf;
+    *data_size = buf_size;
+
+    if (buf_size < CHUNK_HEADER_SIZE) {  // Insufficient data.
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+
+    chunk_size = get_le32(buf + TAG_SIZE);
+    if (chunk_size > MAX_CHUNK_PAYLOAD) {
+      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
+    }
+    // For odd-sized chunk-payload, there's one byte padding at the end.
+    disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1;
+    total_size += disk_chunk_size;
+
+    // Check that total bytes skipped so far does not exceed riff_size.
+    if (riff_size > 0 && (total_size > riff_size)) {
+      return VP8_STATUS_BITSTREAM_ERROR;          // Not a valid chunk size.
+    }
+
+    // Start of a (possibly incomplete) VP8/VP8L chunk implies that we have
+    // parsed all the optional chunks.
+    // Note: This check must occur before the check 'buf_size < disk_chunk_size'
+    // below to allow incomplete VP8/VP8L chunks.
+    if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
+        !memcmp(buf, "VP8L", TAG_SIZE)) {
+      return VP8_STATUS_OK;
+    }
+
+    if (buf_size < disk_chunk_size) {             // Insufficient data.
+      return VP8_STATUS_NOT_ENOUGH_DATA;
+    }
+
+    if (!memcmp(buf, "ALPH", TAG_SIZE)) {         // A valid ALPH header.
+      *alpha_data = buf + CHUNK_HEADER_SIZE;
+      *alpha_size = chunk_size;
+    }
+
+    // We have a full and valid chunk; skip it.
+    buf += disk_chunk_size;
+    buf_size -= disk_chunk_size;
+  }
+}
+
+// Validates the VP8/VP8L Header ("VP8 nnnn" or "VP8L nnnn") and skips over it.
+// Returns VP8_STATUS_BITSTREAM_ERROR for invalid (chunk larger than
+//         riff_size) VP8/VP8L header,
+//         VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
+//         VP8_STATUS_OK otherwise.
+// If a VP8/VP8L chunk is found, *chunk_size is set to the total number of bytes
+// extracted from the VP8/VP8L chunk header.
+// The flag '*is_lossless' is set to 1 in case of VP8L chunk / raw VP8L data.
+static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,
+                                    size_t* const data_size, int have_all_data,
+                                    size_t riff_size, size_t* const chunk_size,
+                                    int* const is_lossless) {
+  const uint8_t* const data = *data_ptr;
+  const int is_vp8 = !memcmp(data, "VP8 ", TAG_SIZE);
+  const int is_vp8l = !memcmp(data, "VP8L", TAG_SIZE);
+  const uint32_t minimal_size =
+      TAG_SIZE + CHUNK_HEADER_SIZE;  // "WEBP" + "VP8 nnnn" OR
+                                     // "WEBP" + "VP8Lnnnn"
+  assert(data != NULL);
+  assert(data_size != NULL);
+  assert(chunk_size != NULL);
+  assert(is_lossless != NULL);
+
+  if (*data_size < CHUNK_HEADER_SIZE) {
+    return VP8_STATUS_NOT_ENOUGH_DATA;  // Insufficient data.
+  }
+
+  if (is_vp8 || is_vp8l) {
+    // Bitstream contains VP8/VP8L header.
+    const uint32_t size = get_le32(data + TAG_SIZE);
+    if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) {
+      return VP8_STATUS_BITSTREAM_ERROR;  // Inconsistent size information.
+    }
+    if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) {
+      return VP8_STATUS_NOT_ENOUGH_DATA;  // Truncated bitstream.
+    }
+    // Skip over CHUNK_HEADER_SIZE bytes from VP8/VP8L Header.
+    *chunk_size = size;
+    *data_ptr += CHUNK_HEADER_SIZE;
+    *data_size -= CHUNK_HEADER_SIZE;
+    *is_lossless = is_vp8l;
+  } else {
+    // Raw VP8/VP8L bitstream (no header).
+    *is_lossless = VP8LCheckSignature(data, *data_size);
+    *chunk_size = *data_size;
+  }
+
+  return VP8_STATUS_OK;
+}
+
+//------------------------------------------------------------------------------
+
+// Fetch '*width', '*height', '*has_alpha' and fill out 'headers' based on
+// 'data'. All the output parameters may be NULL. If 'headers' is NULL only the
+// minimal amount will be read to fetch the remaining parameters.
+// If 'headers' is non-NULL this function will attempt to locate both alpha
+// data (with or without a VP8X chunk) and the bitstream chunk (VP8/VP8L).
+// Note: The following chunk sequences (before the raw VP8/VP8L data) are
+// considered valid by this function:
+// RIFF + VP8(L)
+// RIFF + VP8X + (optional chunks) + VP8(L)
+// ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purpose.
+// VP8(L)     <-- Not a valid WebP format: only allowed for internal purpose.
+static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
+                                          size_t data_size,
+                                          int* const width,
+                                          int* const height,
+                                          int* const has_alpha,
+                                          int* const has_animation,
+                                          int* const format,
+                                          WebPHeaderStructure* const headers) {
+  int canvas_width = 0;
+  int canvas_height = 0;
+  int image_width = 0;
+  int image_height = 0;
+  int found_riff = 0;
+  int found_vp8x = 0;
+  int animation_present = 0;
+  int fragments_present = 0;
+  const int have_all_data = (headers != NULL) ? headers->have_all_data : 0;
+
+  VP8StatusCode status;
+  WebPHeaderStructure hdrs;
+
+  if (data == NULL || data_size < RIFF_HEADER_SIZE) {
+    return VP8_STATUS_NOT_ENOUGH_DATA;
+  }
+  memset(&hdrs, 0, sizeof(hdrs));
+  hdrs.data = data;
+  hdrs.data_size = data_size;
+
+  // Skip over RIFF header.
+  status = ParseRIFF(&data, &data_size, have_all_data, &hdrs.riff_size);
+  if (status != VP8_STATUS_OK) {
+    return status;   // Wrong RIFF header / insufficient data.
+  }
+  found_riff = (hdrs.riff_size > 0);
+
+  // Skip over VP8X.
+  {
+    uint32_t flags = 0;
+    status = ParseVP8X(&data, &data_size, &found_vp8x,
+                       &canvas_width, &canvas_height, &flags);
+    if (status != VP8_STATUS_OK) {
+      return status;  // Wrong VP8X / insufficient data.
+    }
+    animation_present = !!(flags & ANIMATION_FLAG);
+    fragments_present = !!(flags & FRAGMENTS_FLAG);
+    if (!found_riff && found_vp8x) {
+      // Note: This restriction may be removed in the future, if it becomes
+      // necessary to send VP8X chunk to the decoder.
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+    if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG);
+    if (has_animation != NULL) *has_animation = animation_present;
+    if (format != NULL) *format = 0;   // default = undefined
+
+    image_width = canvas_width;
+    image_height = canvas_height;
+    if (found_vp8x && (animation_present || fragments_present) &&
+        headers == NULL) {
+      status = VP8_STATUS_OK;
+      goto ReturnWidthHeight;  // Just return features from VP8X header.
+    }
+  }
+
+  if (data_size < TAG_SIZE) {
+    status = VP8_STATUS_NOT_ENOUGH_DATA;
+    goto ReturnWidthHeight;
+  }
+
+  // Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH".
+  if ((found_riff && found_vp8x) ||
+      (!found_riff && !found_vp8x && !memcmp(data, "ALPH", TAG_SIZE))) {
+    status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size,
+                                 &hdrs.alpha_data, &hdrs.alpha_data_size);
+    if (status != VP8_STATUS_OK) {
+      goto ReturnWidthHeight;  // Invalid chunk size / insufficient data.
+    }
+  }
+
+  // Skip over VP8/VP8L header.
+  status = ParseVP8Header(&data, &data_size, have_all_data, hdrs.riff_size,
+                          &hdrs.compressed_size, &hdrs.is_lossless);
+  if (status != VP8_STATUS_OK) {
+    goto ReturnWidthHeight;  // Wrong VP8/VP8L chunk-header / insufficient data.
+  }
+  if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) {
+    return VP8_STATUS_BITSTREAM_ERROR;
+  }
+
+  if (format != NULL && !(animation_present || fragments_present)) {
+    *format = hdrs.is_lossless ? 2 : 1;
+  }
+
+  if (!hdrs.is_lossless) {
+    if (data_size < VP8_FRAME_HEADER_SIZE) {
+      status = VP8_STATUS_NOT_ENOUGH_DATA;
+      goto ReturnWidthHeight;
+    }
+    // Validates raw VP8 data.
+    if (!VP8GetInfo(data, data_size, (uint32_t)hdrs.compressed_size,
+                    &image_width, &image_height)) {
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+  } else {
+    if (data_size < VP8L_FRAME_HEADER_SIZE) {
+      status = VP8_STATUS_NOT_ENOUGH_DATA;
+      goto ReturnWidthHeight;
+    }
+    // Validates raw VP8L data.
+    if (!VP8LGetInfo(data, data_size, &image_width, &image_height, has_alpha)) {
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+  }
+  // Validates image size coherency.
+  if (found_vp8x) {
+    if (canvas_width != image_width || canvas_height != image_height) {
+      return VP8_STATUS_BITSTREAM_ERROR;
+    }
+  }
+  if (headers != NULL) {
+    *headers = hdrs;
+    headers->offset = data - headers->data;
+    assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD);
+    assert(headers->offset == headers->data_size - data_size);
+  }
+ ReturnWidthHeight:
+  if (status == VP8_STATUS_OK ||
+      (status == VP8_STATUS_NOT_ENOUGH_DATA && found_vp8x && headers == NULL)) {
+    if (has_alpha != NULL) {
+      // If the data did not contain a VP8X/VP8L chunk the only definitive way
+      // to set this is by looking for alpha data (from an ALPH chunk).
+      *has_alpha |= (hdrs.alpha_data != NULL);
+    }
+    if (width != NULL) *width = image_width;
+    if (height != NULL) *height = image_height;
+    return VP8_STATUS_OK;
+  } else {
+    return status;
+  }
+}
+
+// Skips over all valid chunks prior to the first VP8/VP8L frame header.
+// Returns: VP8_STATUS_OK, VP8_STATUS_BITSTREAM_ERROR (invalid header/chunk),
+// VP8_STATUS_NOT_ENOUGH_DATA (partial input) or VP8_STATUS_UNSUPPORTED_FEATURE
+// in the case of non-decodable features (animation for instance).
+// In 'headers', compressed_size, offset, alpha_data, alpha_size, and lossless
+// fields are updated appropriately upon success.
+static VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
+  VP8StatusCode status;
+  int has_animation = 0;
+  assert(headers != NULL);
+  // fill out headers, ignore width/height/has_alpha.
+  status = ParseHeadersInternal(headers->data, headers->data_size,
+                                NULL, NULL, NULL, &has_animation,
+                                NULL, headers);
+  if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
+    // TODO(jzern): full support of animation frames will require API additions.
+    if (has_animation) {
+      status = VP8_STATUS_UNSUPPORTED_FEATURE;
+    }
+  }
+  return status;
+}
+
+//------------------------------------------------------------------------------
+// "Into" decoding variants
+
+// Main flow
+static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
+                                WebPDecParams* const params) {
+  VP8StatusCode status;
+  VP8Io io;
+  WebPHeaderStructure headers;
+
+  headers.data = data;
+  headers.data_size = data_size;
+  headers.have_all_data = 1;
+  status = WebPParseHeaders(&headers);   // Process Pre-VP8 chunks.
+  if (status != VP8_STATUS_OK) {
+    return status;
+  }
+
+  assert(params != NULL);
+  VP8InitIo(&io);
+  io.data = headers.data + headers.offset;
+  io.data_size = headers.data_size - headers.offset;
+  WebPInitCustomIo(params, &io);  // Plug the I/O functions.
+
+  if (!headers.is_lossless) {
+    VP8Decoder* const dec = VP8New();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    dec->alpha_data_ = headers.alpha_data;
+    dec->alpha_data_size_ = headers.alpha_data_size;
+
+    // Decode bitstream header, update io->width/io->height.
+    if (!VP8GetHeaders(dec, &io)) {
+      status = dec->status_;   // An error occurred. Grab error status.
+    } else {
+      // Allocate/check output buffers.
+      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
+                                     params->output);
+      if (status == VP8_STATUS_OK) {  // Decode
+        VP8InitDithering(params->options, dec);
+        if (!VP8Decode(dec, &io)) {
+          status = dec->status_;
+        }
+      }
+    }
+    VP8Delete(dec);
+  } else {
+    VP8LDecoder* const dec = VP8LNew();
+    if (dec == NULL) {
+      return VP8_STATUS_OUT_OF_MEMORY;
+    }
+    if (!VP8LDecodeHeader(dec, &io)) {
+      status = dec->status_;   // An error occurred. Grab error status.
+    } else {
+      // Allocate/check output buffers.
+      status = WebPAllocateDecBuffer(io.width, io.height, params->options,
+                                     params->output);
+      if (status == VP8_STATUS_OK) {  // Decode
+        if (!VP8LDecodeImage(dec)) {
+          status = dec->status_;
+        }
+      }
+    }
+    VP8LDelete(dec);
+  }
+
+  if (status != VP8_STATUS_OK) {
+    WebPFreeDecBuffer(params->output);
+  }
+
+  if (params->options != NULL && params->options->flip) {
+    status = WebPFlipBuffer(params->output);
+  }
+  return status;
+}
+
+//------------------------------------------------------------------------------
+
+static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* const data,
+                       size_t data_size, int* const width, int* const height,
+                       WebPDecBuffer* const keep_info) {
+  WebPDecParams params;
+  WebPDecBuffer output;
+
+  WebPInitDecBuffer(&output);
+  memset(&params, 0, sizeof(params));
+  params.output = &output;
+  output.colorspace = mode;
+
+  // Retrieve (and report back) the required dimensions from bitstream.
+  if (!WebPGetInfo(data, data_size, &output.width, &output.height)) {
+    return NULL;
+  }
+  if (width != NULL) *width = output.width;
+  if (height != NULL) *height = output.height;
+
+  // Decode
+  if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
+    return NULL;
+  }
+  if (keep_info != NULL) {    // keep track of the side-info
+    WebPCopyDecBuffer(&output, keep_info);
+  }
+  // return decoded samples (don't clear 'output'!)
+  return WebPIsRGBMode(mode) ? output.u.RGBA.rgba : output.u.YUVA.y;
+}
+
+
+static void DefaultFeatures(WebPBitstreamFeatures* const features) {
+  assert(features != NULL);
+  memset(features, 0, sizeof(*features));
+}
+
+static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
+                                 WebPBitstreamFeatures* const features) {
+  if (features == NULL || data == NULL) {
+    return VP8_STATUS_INVALID_PARAM;
+  }
+  DefaultFeatures(features);
+
+  // Only parse enough of the data to retrieve the features.
+  return ParseHeadersInternal(data, data_size,
+                              &features->width, &features->height,
+                              &features->has_alpha, &features->has_animation,
+                              &features->format, NULL);
+}
+
+//------------------------------------------------------------------------------
+// Cropping and rescaling.
+
+int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
+                          VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
+  const int W = io->width;
+  const int H = io->height;
+  int x = 0, y = 0, w = W, h = H;
+
+  // Cropping
+  io->use_cropping = (options != NULL) && (options->use_cropping > 0);
+  if (io->use_cropping) {
+    w = options->crop_width;
+    h = options->crop_height;
+    x = options->crop_left;
+    y = options->crop_top;
+    if (!WebPIsRGBMode(src_colorspace)) {   // only snap for YUV420
+      x &= ~1;
+      y &= ~1;
+    }
+    if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
+      return 0;  // out of frame boundary error
+    }
+  }
+  io->crop_left   = x;
+  io->crop_top    = y;
+  io->crop_right  = x + w;
+  io->crop_bottom = y + h;
+  io->mb_w = w;
+  io->mb_h = h;
+
+  // Scaling
+  io->use_scaling = (options != NULL) && (options->use_scaling > 0);
+  if (io->use_scaling) {
+    if (options->scaled_width <= 0 || options->scaled_height <= 0) {
+      return 0;
+    }
+    io->scaled_width = options->scaled_width;
+    io->scaled_height = options->scaled_height;
+  }
+
+  // Filter
+  io->bypass_filtering = options && options->bypass_filtering;
+
+  // Fancy upsampler
+#ifdef FANCY_UPSAMPLING
+  io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
+#endif
+
+  if (io->use_scaling) {
+    // disable filter (only for large downscaling ratio).
+    io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
+                           (io->scaled_height < H * 3 / 4);
+    io->fancy_upsampling = 0;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+
+
+/************************************************************************/
+/* External Class Implementation                                        */
+/************************************************************************/
+
+
+uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size,
+                        int* width, int* height) {
+  return Decode(MODE_BGRA, data, data_size, width, height, NULL);
+}
+
+
+int WebPGetInfo(const uint8_t* data, size_t data_size,
+                int* width, int* height) {
+  WebPBitstreamFeatures features;
+
+  if (GetFeatures(data, data_size, &features) != VP8_STATUS_OK) {
+    return 0;
+  }
+
+  if (width != NULL) {
+    *width  = features.width;
+  }
+  if (height != NULL) {
+    *height = features.height;
+  }
+
+  return 1;
+}
diff --git a/src/loaders/webp/dec/webpi.h b/src/loaders/webp/dec/webpi.h
new file mode 100644
index 00000000..aed81cb5
--- /dev/null
+++ b/src/loaders/webp/dec/webpi.h
@@ -0,0 +1,108 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Internal header: WebP decoding parameters and custom IO on buffer
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#ifndef WEBP_DEC_WEBPI_H_
+#define WEBP_DEC_WEBPI_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "../utils/rescaler.h"
+#include "./decode_vp8.h"
+
+//------------------------------------------------------------------------------
+// WebPDecParams: Decoding output parameters. Transient internal object.
+
+typedef struct WebPDecParams WebPDecParams;
+typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
+typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos);
+
+struct WebPDecParams {
+  WebPDecBuffer* output;             // output buffer.
+  uint8_t* tmp_y, *tmp_u, *tmp_v;    // cache for the fancy upsampler
+                                     // or used for tmp rescaling
+
+  int last_y;                 // coordinate of the line that was last output
+  const WebPDecoderOptions* options;  // if not NULL, use alt decoding features
+  // rescalers
+  WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
+  void* memory;                  // overall scratch memory for the output work.
+
+  OutputFunc emit;               // output RGB or YUV samples
+  OutputFunc emit_alpha;         // output alpha channel
+  OutputRowFunc emit_alpha_row;  // output one line of rescaled alpha values
+};
+
+
+//------------------------------------------------------------------------------
+// Header parsing helpers
+
+// Structure storing a description of the RIFF headers.
+typedef struct {
+  const uint8_t* data;         // input buffer
+  size_t data_size;            // input buffer size
+  int have_all_data;           // true if all data is known to be available
+  size_t offset;               // offset to main data chunk (VP8 or VP8L)
+  const uint8_t* alpha_data;   // points to alpha chunk (if present)
+  size_t alpha_data_size;      // alpha chunk size
+  size_t compressed_size;      // VP8/VP8L compressed data size
+  size_t riff_size;            // size of the riff payload (or 0 if absent)
+  int is_lossless;             // true if a VP8L chunk is present
+} WebPHeaderStructure;
+
+
+//------------------------------------------------------------------------------
+// Misc utils
+
+// Initializes VP8Io with custom setup, io and teardown functions. The default
+// hooks will use the supplied 'params' as io->opaque handle.
+void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
+
+// Setup crop_xxx fields, mb_w and mb_h in io. 'src_colorspace' refers
+// to the *compressed* format, not the output one.
+int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
+                          VP8Io* const io, WEBP_CSP_MODE src_colorspace);
+
+//------------------------------------------------------------------------------
+// Internal functions regarding WebPDecBuffer memory (in buffer.c).
+// Don't really need to be externally visible for now.
+
+// Prepare 'buffer' with the requested initial dimensions width/height.
+// If no external storage is supplied, initializes buffer by allocating output
+// memory and setting up the stride information. Validate the parameters. Return
+// an error code in case of problem (no memory, or invalid stride / size /
+// dimension / etc.). If *options is not NULL, also verify that the options'
+// parameters are valid and apply them to the width/height dimensions of the
+// output buffer. This takes cropping / scaling / rotation into account.
+// Also incorporates the options->flip flag to flip the buffer parameters if
+// needed.
+VP8StatusCode WebPAllocateDecBuffer(int width, int height,
+                                    const WebPDecoderOptions* const options,
+                                    WebPDecBuffer* const buffer);
+
+// Flip buffer vertically by negating the various strides.
+VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer);
+
+// Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
+// memory (still held by 'src').
+void WebPCopyDecBuffer(const WebPDecBuffer* const src,
+                       WebPDecBuffer* const dst);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_WEBPI_H_ */
diff --git a/src/loaders/webp/dsp/alpha_processing.cpp b/src/loaders/webp/dsp/alpha_processing.cpp
new file mode 100644
index 00000000..bef13f41
--- /dev/null
+++ b/src/loaders/webp/dsp/alpha_processing.cpp
@@ -0,0 +1,377 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include "./dsp.h"
+
+// Tables can be faster on some platform but incur some extra binary size (~2k).
+// #define USE_TABLES_FOR_ALPHA_MULT
+
+// -----------------------------------------------------------------------------
+
+#define MFIX 24    // 24bit fixed-point arithmetic
+#define HALF ((1u << MFIX) >> 1)
+#define KINV_255 ((1u << MFIX) / 255u)
+
+static uint32_t Mult(uint8_t x, uint32_t mult) {
+  const uint32_t v = (x * mult + HALF) >> MFIX;
+  assert(v <= 255);  // <- 24bit precision is enough to ensure that.
+  return v;
+}
+
+#ifdef USE_TABLES_FOR_ALPHA_MULT
+
+static const uint32_t kMultTables[2][256] = {
+  {    // (255u << MFIX) / alpha
+    0x00000000, 0xff000000, 0x7f800000, 0x55000000, 0x3fc00000, 0x33000000,
+    0x2a800000, 0x246db6db, 0x1fe00000, 0x1c555555, 0x19800000, 0x172e8ba2,
+    0x15400000, 0x139d89d8, 0x1236db6d, 0x11000000, 0x0ff00000, 0x0f000000,
+    0x0e2aaaaa, 0x0d6bca1a, 0x0cc00000, 0x0c249249, 0x0b9745d1, 0x0b1642c8,
+    0x0aa00000, 0x0a333333, 0x09cec4ec, 0x0971c71c, 0x091b6db6, 0x08cb08d3,
+    0x08800000, 0x0839ce73, 0x07f80000, 0x07ba2e8b, 0x07800000, 0x07492492,
+    0x07155555, 0x06e45306, 0x06b5e50d, 0x0689d89d, 0x06600000, 0x063831f3,
+    0x06124924, 0x05ee23b8, 0x05cba2e8, 0x05aaaaaa, 0x058b2164, 0x056cefa8,
+    0x05500000, 0x05343eb1, 0x05199999, 0x05000000, 0x04e76276, 0x04cfb2b7,
+    0x04b8e38e, 0x04a2e8ba, 0x048db6db, 0x0479435e, 0x04658469, 0x045270d0,
+    0x04400000, 0x042e29f7, 0x041ce739, 0x040c30c3, 0x03fc0000, 0x03ec4ec4,
+    0x03dd1745, 0x03ce540f, 0x03c00000, 0x03b21642, 0x03a49249, 0x03976fc6,
+    0x038aaaaa, 0x037e3f1f, 0x03722983, 0x03666666, 0x035af286, 0x034fcace,
+    0x0344ec4e, 0x033a5440, 0x03300000, 0x0325ed09, 0x031c18f9, 0x0312818a,
+    0x03092492, 0x03000000, 0x02f711dc, 0x02ee5846, 0x02e5d174, 0x02dd7baf,
+    0x02d55555, 0x02cd5cd5, 0x02c590b2, 0x02bdef7b, 0x02b677d4, 0x02af286b,
+    0x02a80000, 0x02a0fd5c, 0x029a1f58, 0x029364d9, 0x028ccccc, 0x0286562d,
+    0x02800000, 0x0279c952, 0x0273b13b, 0x026db6db, 0x0267d95b, 0x026217ec,
+    0x025c71c7, 0x0256e62a, 0x0251745d, 0x024c1bac, 0x0246db6d, 0x0241b2f9,
+    0x023ca1af, 0x0237a6f4, 0x0232c234, 0x022df2df, 0x02293868, 0x02249249,
+    0x02200000, 0x021b810e, 0x021714fb, 0x0212bb51, 0x020e739c, 0x020a3d70,
+    0x02061861, 0x02020408, 0x01fe0000, 0x01fa0be8, 0x01f62762, 0x01f25213,
+    0x01ee8ba2, 0x01ead3ba, 0x01e72a07, 0x01e38e38, 0x01e00000, 0x01dc7f10,
+    0x01d90b21, 0x01d5a3e9, 0x01d24924, 0x01cefa8d, 0x01cbb7e3, 0x01c880e5,
+    0x01c55555, 0x01c234f7, 0x01bf1f8f, 0x01bc14e5, 0x01b914c1, 0x01b61eed,
+    0x01b33333, 0x01b05160, 0x01ad7943, 0x01aaaaaa, 0x01a7e567, 0x01a5294a,
+    0x01a27627, 0x019fcbd2, 0x019d2a20, 0x019a90e7, 0x01980000, 0x01957741,
+    0x0192f684, 0x01907da4, 0x018e0c7c, 0x018ba2e8, 0x018940c5, 0x0186e5f0,
+    0x01849249, 0x018245ae, 0x01800000, 0x017dc11f, 0x017b88ee, 0x0179574e,
+    0x01772c23, 0x01750750, 0x0172e8ba, 0x0170d045, 0x016ebdd7, 0x016cb157,
+    0x016aaaaa, 0x0168a9b9, 0x0166ae6a, 0x0164b8a7, 0x0162c859, 0x0160dd67,
+    0x015ef7bd, 0x015d1745, 0x015b3bea, 0x01596596, 0x01579435, 0x0155c7b4,
+    0x01540000, 0x01523d03, 0x01507eae, 0x014ec4ec, 0x014d0fac, 0x014b5edc,
+    0x0149b26c, 0x01480a4a, 0x01466666, 0x0144c6af, 0x01432b16, 0x0141938b,
+    0x01400000, 0x013e7063, 0x013ce4a9, 0x013b5cc0, 0x0139d89d, 0x01385830,
+    0x0136db6d, 0x01356246, 0x0133ecad, 0x01327a97, 0x01310bf6, 0x012fa0be,
+    0x012e38e3, 0x012cd459, 0x012b7315, 0x012a150a, 0x0128ba2e, 0x01276276,
+    0x01260dd6, 0x0124bc44, 0x01236db6, 0x01222222, 0x0120d97c, 0x011f93bc,
+    0x011e50d7, 0x011d10c4, 0x011bd37a, 0x011a98ef, 0x0119611a, 0x01182bf2,
+    0x0116f96f, 0x0115c988, 0x01149c34, 0x0113716a, 0x01124924, 0x01112358,
+    0x01100000, 0x010edf12, 0x010dc087, 0x010ca458, 0x010b8a7d, 0x010a72f0,
+    0x01095da8, 0x01084a9f, 0x010739ce, 0x01062b2e, 0x01051eb8, 0x01041465,
+    0x01030c30, 0x01020612, 0x01010204, 0x01000000 },
+  {   // alpha * KINV_255
+    0x00000000, 0x00010101, 0x00020202, 0x00030303, 0x00040404, 0x00050505,
+    0x00060606, 0x00070707, 0x00080808, 0x00090909, 0x000a0a0a, 0x000b0b0b,
+    0x000c0c0c, 0x000d0d0d, 0x000e0e0e, 0x000f0f0f, 0x00101010, 0x00111111,
+    0x00121212, 0x00131313, 0x00141414, 0x00151515, 0x00161616, 0x00171717,
+    0x00181818, 0x00191919, 0x001a1a1a, 0x001b1b1b, 0x001c1c1c, 0x001d1d1d,
+    0x001e1e1e, 0x001f1f1f, 0x00202020, 0x00212121, 0x00222222, 0x00232323,
+    0x00242424, 0x00252525, 0x00262626, 0x00272727, 0x00282828, 0x00292929,
+    0x002a2a2a, 0x002b2b2b, 0x002c2c2c, 0x002d2d2d, 0x002e2e2e, 0x002f2f2f,
+    0x00303030, 0x00313131, 0x00323232, 0x00333333, 0x00343434, 0x00353535,
+    0x00363636, 0x00373737, 0x00383838, 0x00393939, 0x003a3a3a, 0x003b3b3b,
+    0x003c3c3c, 0x003d3d3d, 0x003e3e3e, 0x003f3f3f, 0x00404040, 0x00414141,
+    0x00424242, 0x00434343, 0x00444444, 0x00454545, 0x00464646, 0x00474747,
+    0x00484848, 0x00494949, 0x004a4a4a, 0x004b4b4b, 0x004c4c4c, 0x004d4d4d,
+    0x004e4e4e, 0x004f4f4f, 0x00505050, 0x00515151, 0x00525252, 0x00535353,
+    0x00545454, 0x00555555, 0x00565656, 0x00575757, 0x00585858, 0x00595959,
+    0x005a5a5a, 0x005b5b5b, 0x005c5c5c, 0x005d5d5d, 0x005e5e5e, 0x005f5f5f,
+    0x00606060, 0x00616161, 0x00626262, 0x00636363, 0x00646464, 0x00656565,
+    0x00666666, 0x00676767, 0x00686868, 0x00696969, 0x006a6a6a, 0x006b6b6b,
+    0x006c6c6c, 0x006d6d6d, 0x006e6e6e, 0x006f6f6f, 0x00707070, 0x00717171,
+    0x00727272, 0x00737373, 0x00747474, 0x00757575, 0x00767676, 0x00777777,
+    0x00787878, 0x00797979, 0x007a7a7a, 0x007b7b7b, 0x007c7c7c, 0x007d7d7d,
+    0x007e7e7e, 0x007f7f7f, 0x00808080, 0x00818181, 0x00828282, 0x00838383,
+    0x00848484, 0x00858585, 0x00868686, 0x00878787, 0x00888888, 0x00898989,
+    0x008a8a8a, 0x008b8b8b, 0x008c8c8c, 0x008d8d8d, 0x008e8e8e, 0x008f8f8f,
+    0x00909090, 0x00919191, 0x00929292, 0x00939393, 0x00949494, 0x00959595,
+    0x00969696, 0x00979797, 0x00989898, 0x00999999, 0x009a9a9a, 0x009b9b9b,
+    0x009c9c9c, 0x009d9d9d, 0x009e9e9e, 0x009f9f9f, 0x00a0a0a0, 0x00a1a1a1,
+    0x00a2a2a2, 0x00a3a3a3, 0x00a4a4a4, 0x00a5a5a5, 0x00a6a6a6, 0x00a7a7a7,
+    0x00a8a8a8, 0x00a9a9a9, 0x00aaaaaa, 0x00ababab, 0x00acacac, 0x00adadad,
+    0x00aeaeae, 0x00afafaf, 0x00b0b0b0, 0x00b1b1b1, 0x00b2b2b2, 0x00b3b3b3,
+    0x00b4b4b4, 0x00b5b5b5, 0x00b6b6b6, 0x00b7b7b7, 0x00b8b8b8, 0x00b9b9b9,
+    0x00bababa, 0x00bbbbbb, 0x00bcbcbc, 0x00bdbdbd, 0x00bebebe, 0x00bfbfbf,
+    0x00c0c0c0, 0x00c1c1c1, 0x00c2c2c2, 0x00c3c3c3, 0x00c4c4c4, 0x00c5c5c5,
+    0x00c6c6c6, 0x00c7c7c7, 0x00c8c8c8, 0x00c9c9c9, 0x00cacaca, 0x00cbcbcb,
+    0x00cccccc, 0x00cdcdcd, 0x00cecece, 0x00cfcfcf, 0x00d0d0d0, 0x00d1d1d1,
+    0x00d2d2d2, 0x00d3d3d3, 0x00d4d4d4, 0x00d5d5d5, 0x00d6d6d6, 0x00d7d7d7,
+    0x00d8d8d8, 0x00d9d9d9, 0x00dadada, 0x00dbdbdb, 0x00dcdcdc, 0x00dddddd,
+    0x00dedede, 0x00dfdfdf, 0x00e0e0e0, 0x00e1e1e1, 0x00e2e2e2, 0x00e3e3e3,
+    0x00e4e4e4, 0x00e5e5e5, 0x00e6e6e6, 0x00e7e7e7, 0x00e8e8e8, 0x00e9e9e9,
+    0x00eaeaea, 0x00ebebeb, 0x00ececec, 0x00ededed, 0x00eeeeee, 0x00efefef,
+    0x00f0f0f0, 0x00f1f1f1, 0x00f2f2f2, 0x00f3f3f3, 0x00f4f4f4, 0x00f5f5f5,
+    0x00f6f6f6, 0x00f7f7f7, 0x00f8f8f8, 0x00f9f9f9, 0x00fafafa, 0x00fbfbfb,
+    0x00fcfcfc, 0x00fdfdfd, 0x00fefefe, 0x00ffffff }
+};
+
+static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
+  return kMultTables[!inverse][a];
+}
+
+#else
+
+static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
+  return inverse ? (255u << MFIX) / a : a * KINV_255;
+}
+
+#endif    // USE_TABLES_FOR_ALPHA_MULT
+
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint32_t argb = ptr[x];
+    if (argb < 0xff000000u) {      // alpha < 255
+      if (argb <= 0x00ffffffu) {   // alpha == 0
+        ptr[x] = 0;
+      } else {
+        const uint32_t alpha = (argb >> 24) & 0xff;
+        const uint32_t scale = GetScale(alpha, inverse);
+        uint32_t out = argb & 0xff000000u;
+        out |= Mult(argb >>  0, scale) <<  0;
+        out |= Mult(argb >>  8, scale) <<  8;
+        out |= Mult(argb >> 16, scale) << 16;
+        ptr[x] = out;
+      }
+    }
+  }
+}
+
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+                  int width, int inverse) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint32_t a = alpha[x];
+    if (a != 255) {
+      if (a == 0) {
+        ptr[x] = 0;
+      } else {
+        const uint32_t scale = GetScale(a, inverse);
+        ptr[x] = Mult(ptr[x], scale);
+      }
+    }
+  }
+}
+
+#undef KINV_255
+#undef HALF
+#undef MFIX
+
+void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
+void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+                    int width, int inverse);
+
+//------------------------------------------------------------------------------
+// Generic per-plane calls
+
+void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
+                      int inverse) {
+  int n;
+  for (n = 0; n < num_rows; ++n) {
+    WebPMultARGBRow((uint32_t*)ptr, width, inverse);
+    ptr += stride;
+  }
+}
+
+void WebPMultRows(uint8_t* ptr, int stride,
+                  const uint8_t* alpha, int alpha_stride,
+                  int width, int num_rows, int inverse) {
+  int n;
+  for (n = 0; n < num_rows; ++n) {
+    WebPMultRow(ptr, alpha, width, inverse);
+    ptr += stride;
+    alpha += alpha_stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Premultiplied modes
+
+// non dithered-modes
+
+// (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
+// for all 8bit x or a. For bit-wise equivalence to (int)(x * a / 255. + .5),
+// one can use instead: (x * a * 65793 + (1 << 23)) >> 24
+#if 1     // (int)(x * a / 255.)
+#define MULTIPLIER(a)   ((a) * 32897U)
+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
+#else     // (int)(x * a / 255. + .5)
+#define MULTIPLIER(a) ((a) * 65793U)
+#define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
+#endif
+
+static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
+                               int w, int h, int stride) {
+  while (h-- > 0) {
+    uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
+    const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
+    int i;
+    for (i = 0; i < w; ++i) {
+      const uint32_t a = alpha[4 * i];
+      if (a != 0xff) {
+        const uint32_t mult = MULTIPLIER(a);
+        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
+        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
+        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
+      }
+    }
+    rgba += stride;
+  }
+}
+#undef MULTIPLIER
+#undef PREMULTIPLY
+
+// rgbA4444
+
+#define MULTIPLIER(a)  ((a) * 0x1111)    // 0x1111 ~= (1 << 16) / 15
+
+static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
+  return (x & 0xf0) | (x >> 4);
+}
+
+static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
+  return (x & 0x0f) | (x << 4);
+}
+
+static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
+  return (x * m) >> 16;
+}
+
+static WEBP_INLINE void ApplyAlphaMultiply4444(uint8_t* rgba4444,
+                                               int w, int h, int stride,
+                                               int rg_byte_pos /* 0 or 1 */) {
+  while (h-- > 0) {
+    int i;
+    for (i = 0; i < w; ++i) {
+      const uint32_t rg = rgba4444[2 * i + rg_byte_pos];
+      const uint32_t ba = rgba4444[2 * i + (rg_byte_pos ^ 1)];
+      const uint8_t a = ba & 0x0f;
+      const uint32_t mult = MULTIPLIER(a);
+      const uint8_t r = multiply(dither_hi(rg), mult);
+      const uint8_t g = multiply(dither_lo(rg), mult);
+      const uint8_t b = multiply(dither_hi(ba), mult);
+      rgba4444[2 * i + rg_byte_pos] = (r & 0xf0) | ((g >> 4) & 0x0f);
+      rgba4444[2 * i + (rg_byte_pos ^ 1)] = (b & 0xf0) | a;
+    }
+    rgba4444 += stride;
+  }
+}
+#undef MULTIPLIER
+
+static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
+                                   int w, int h, int stride) {
+#ifdef WEBP_SWAP_16BIT_CSP
+  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 1);
+#else
+  ApplyAlphaMultiply4444(rgba4444, w, h, stride, 0);
+#endif
+}
+
+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+                         int width, int height,
+                         uint8_t* dst, int dst_stride) {
+  uint32_t alpha_mask = 0xff;
+  int i, j;
+
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      const uint32_t alpha_value = alpha[i];
+      dst[4 * i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+
+  return (alpha_mask != 0xff);
+}
+
+static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
+                                 int width, int height,
+                                 uint32_t* dst, int dst_stride) {
+  int i, j;
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      dst[i] = alpha[i] << 8;  // leave A/R/B channels zero'd.
+    }
+    alpha += alpha_stride;
+    dst += dst_stride;
+  }
+}
+
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+                        int width, int height,
+                        uint8_t* alpha, int alpha_stride) {
+  uint8_t alpha_mask = 0xff;
+  int i, j;
+
+  for (j = 0; j < height; ++j) {
+    for (i = 0; i < width; ++i) {
+      const uint8_t alpha_value = argb[4 * i];
+      alpha[i] = alpha_value;
+      alpha_mask &= alpha_value;
+    }
+    argb += argb_stride;
+    alpha += alpha_stride;
+  }
+  return (alpha_mask == 0xff);
+}
+
+void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
+void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
+int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
+void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
+int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
+
+//------------------------------------------------------------------------------
+// Init function
+
+extern void WebPInitAlphaProcessingMIPSdspR2(void);
+extern void WebPInitAlphaProcessingSSE2(void);
+
+static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
+    (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
+  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  WebPMultARGBRow = WebPMultARGBRowC;
+  WebPMultRow = WebPMultRowC;
+  WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+  WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+  WebPDispatchAlpha = DispatchAlpha;
+  WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
+  WebPExtractAlpha = ExtractAlpha;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitAlphaProcessingSSE2();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      WebPInitAlphaProcessingMIPSdspR2();
+    }
+#endif
+  }
+  alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
+}
diff --git a/src/loaders/webp/dsp/argb.cpp b/src/loaders/webp/dsp/argb.cpp
new file mode 100644
index 00000000..cc1f9a96
--- /dev/null
+++ b/src/loaders/webp/dsp/argb.cpp
@@ -0,0 +1,68 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   ARGB making functions.
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int len, uint32_t* out) {
+  int i;
+  for (i = 0; i < len; ++i) {
+    out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
+  }
+}
+
+static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                    int len, int step, uint32_t* out) {
+  int i, offset = 0;
+  for (i = 0; i < len; ++i) {
+    out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
+    offset += step;
+  }
+}
+
+void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
+                    const uint8_t*, int, uint32_t*);
+void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
+                   int, int, uint32_t*);
+
+extern void VP8EncDspARGBInitMIPSdspR2(void);
+extern void VP8EncDspARGBInitSSE2(void);
+
+static volatile VP8CPUInfo argb_last_cpuinfo_used =
+    (VP8CPUInfo)&argb_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
+  if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  VP8PackARGB = PackARGB;
+  VP8PackRGB = PackRGB;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8EncDspARGBInitSSE2();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8EncDspARGBInitMIPSdspR2();
+    }
+#endif
+  }
+  argb_last_cpuinfo_used = VP8GetCPUInfo;
+}
diff --git a/src/loaders/webp/dsp/cpu.cpp b/src/loaders/webp/dsp/cpu.cpp
new file mode 100644
index 00000000..202d3121
--- /dev/null
+++ b/src/loaders/webp/dsp/cpu.cpp
@@ -0,0 +1,120 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// CPU detection
+//
+// Author: Christian Duvivier (cduvivier@google.com)
+
+#include "./dsp.h"
+
+
+//------------------------------------------------------------------------------
+// SSE2 detection.
+//
+
+// apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
+#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "mov %%ebx, %%edi\n"
+    "cpuid\n"
+    "xchg %%edi, %%ebx\n"
+    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type), "c"(0));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+  __asm__ volatile (
+    "cpuid\n"
+    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+    : "a"(info_type), "c"(0));
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
+#include <intrin.h>
+#define GetCPUInfo(info, type) __cpuidex(info, type, 0)  // set ecx=0
+#elif defined(WEBP_MSC_SSE2)
+#define GetCPUInfo __cpuid
+#endif
+
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static WEBP_INLINE uint64_t xgetbv(void) {
+  const uint32_t ecx = 0;
+  uint32_t eax, edx;
+  // Use the raw opcode for xgetbv for compatibility with older toolchains.
+  __asm__ volatile (
+    ".byte 0x0f, 0x01, 0xd0\n"
+    : "=a"(eax), "=d"(edx) : "c" (ecx));
+  return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static WEBP_INLINE uint64_t xgetbv(void) {
+  uint32_t eax_, edx_;
+  __asm {
+    xor ecx, ecx  // ecx = 0
+    // Use the raw opcode for xgetbv for compatibility with older toolchains.
+    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+    mov eax_, eax
+    mov edx_, edx
+  }
+  return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+static int x86CPUInfo(CPUFeature feature) {
+  int cpu_info[4];
+  GetCPUInfo(cpu_info, 1);
+  if (feature == kSSE2) {
+    return 0 != (cpu_info[3] & 0x04000000);
+  }
+ 
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
+#elif defined(WEBP_ANDROID_NEON)  // NB: needs to be before generic NEON test.
+static int AndroidCPUInfo(CPUFeature feature) {
+  const AndroidCpuFamily cpu_family = android_getCpuFamily();
+  const uint64_t cpu_features = android_getCpuFeatures();
+  if (feature == kNEON) {
+    return (cpu_family == ANDROID_CPU_FAMILY_ARM &&
+            0 != (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON));
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
+#elif defined(WEBP_USE_NEON)
+// define a dummy function to enable turning off NEON at runtime by setting
+// VP8DecGetCPUInfo = NULL
+static int armCPUInfo(CPUFeature feature) {
+  (void)feature;
+  return 1;
+}
+VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
+#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2)
+static int mipsCPUInfo(CPUFeature feature) {
+  if ((feature == kMIPS32) || (feature == kMIPSdspR2)) {
+    return 1;
+  } else {
+    return 0;
+  }
+
+}
+VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
+#else
+VP8CPUInfo VP8GetCPUInfo = NULL;
+#endif
+
diff --git a/src/loaders/webp/dsp/dec.cpp b/src/loaders/webp/dsp/dec.cpp
new file mode 100644
index 00000000..cefbdba0
--- /dev/null
+++ b/src/loaders/webp/dsp/dec.cpp
@@ -0,0 +1,766 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Speed-critical decoding functions, default plain-C implementations.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+#include "../dec/vp8i.h"
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE uint8_t clip_8b(int v) {
+  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+}
+
+//------------------------------------------------------------------------------
+// Transforms (Paragraph 14.4)
+
+#define STORE(x, y, v) \
+  dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
+
+#define STORE2(y, dc, d, c) do {    \
+  const int DC = (dc);              \
+  STORE(0, y, DC + (d));            \
+  STORE(1, y, DC + (c));            \
+  STORE(2, y, DC - (c));            \
+  STORE(3, y, DC - (d));            \
+} while (0)
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+#define MUL(a, b) (((a) * (b)) >> 16)
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+  int C[4 * 4], *tmp;
+  int i;
+  tmp = C;
+  for (i = 0; i < 4; ++i) {    // vertical pass
+    const int a = in[0] + in[8];    // [-4096, 4094]
+    const int b = in[0] - in[8];    // [-4095, 4095]
+    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);   // [-3783, 3783]
+    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);   // [-3785, 3781]
+    tmp[0] = a + d;   // [-7881, 7875]
+    tmp[1] = b + c;   // [-7878, 7878]
+    tmp[2] = b - c;   // [-7878, 7878]
+    tmp[3] = a - d;   // [-7877, 7879]
+    tmp += 4;
+    in++;
+  }
+  // Each pass is expanding the dynamic range by ~3.85 (upper bound).
+  // The exact value is (2. + (kC1 + kC2) / 65536).
+  // After the second pass, maximum interval is [-3794, 3794], assuming
+  // an input in [-2048, 2047] interval. We then need to add a dst value
+  // in the [0, 255] range.
+  // In the worst case scenario, the input to clip_8b() can be as large as
+  // [-60713, 60968].
+  tmp = C;
+  for (i = 0; i < 4; ++i) {    // horizontal pass
+    const int dc = tmp[0] + 4;
+    const int a =  dc +  tmp[8];
+    const int b =  dc -  tmp[8];
+    const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
+    const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+    STORE(0, 0, a + d);
+    STORE(1, 0, b + c);
+    STORE(2, 0, b - c);
+    STORE(3, 0, a - d);
+    tmp++;
+    dst += BPS;
+  }
+}
+
+// Simplified transform when only in[0], in[1] and in[4] are non-zero
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+  const int a = in[0] + 4;
+  const int c4 = MUL(in[4], kC2);
+  const int d4 = MUL(in[4], kC1);
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  STORE2(0, a + d4, d1, c1);
+  STORE2(1, a + c4, d1, c1);
+  STORE2(2, a - c4, d1, c1);
+  STORE2(3, a - d4, d1, c1);
+}
+#undef MUL
+#undef STORE2
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
+  if (do_two) {
+    TransformOne(in + 16, dst + 4);
+  }
+}
+
+static void TransformUV(const int16_t* in, uint8_t* dst) {
+  VP8Transform(in + 0 * 16, dst, 1);
+  VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
+}
+
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+  const int DC = in[0] + 4;
+  int i, j;
+  for (j = 0; j < 4; ++j) {
+    for (i = 0; i < 4; ++i) {
+      STORE(i, j, DC);
+    }
+  }
+}
+
+static void TransformDCUV(const int16_t* in, uint8_t* dst) {
+  if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
+  if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
+  if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
+  if (in[3 * 16]) VP8TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
+}
+
+#undef STORE
+
+//------------------------------------------------------------------------------
+// Paragraph 14.3
+
+static void TransformWHT(const int16_t* in, int16_t* out) {
+  int tmp[16];
+  int i;
+  for (i = 0; i < 4; ++i) {
+    const int a0 = in[0 + i] + in[12 + i];
+    const int a1 = in[4 + i] + in[ 8 + i];
+    const int a2 = in[4 + i] - in[ 8 + i];
+    const int a3 = in[0 + i] - in[12 + i];
+    tmp[0  + i] = a0 + a1;
+    tmp[8  + i] = a0 - a1;
+    tmp[4  + i] = a3 + a2;
+    tmp[12 + i] = a3 - a2;
+  }
+  for (i = 0; i < 4; ++i) {
+    const int dc = tmp[0 + i * 4] + 3;    // w/ rounder
+    const int a0 = dc             + tmp[3 + i * 4];
+    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
+    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
+    const int a3 = dc             - tmp[3 + i * 4];
+    out[ 0] = (a0 + a1) >> 3;
+    out[16] = (a3 + a2) >> 3;
+    out[32] = (a0 - a1) >> 3;
+    out[48] = (a3 - a2) >> 3;
+    out += 64;
+  }
+}
+
+void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+  const uint8_t* top = dst - BPS;
+  const uint8_t* const clip0 = VP8kclip1 - top[-1];
+  int y;
+  for (y = 0; y < size; ++y) {
+    const uint8_t* const clip = clip0 + dst[-1];
+    int x;
+    for (x = 0; x < size; ++x) {
+      dst[x] = clip[top[x]];
+    }
+    dst += BPS;
+  }
+}
+static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }
+
+//------------------------------------------------------------------------------
+// 16x16
+
+static void VE16(uint8_t* dst) {     // vertical
+  int j;
+  for (j = 0; j < 16; ++j) {
+    memcpy(dst + j * BPS, dst - BPS, 16);
+  }
+}
+
+static void HE16(uint8_t* dst) {     // horizontal
+  int j;
+  for (j = 16; j > 0; --j) {
+    memset(dst, dst[-1], 16);
+    dst += BPS;
+  }
+}
+
+static WEBP_INLINE void Put16(int v, uint8_t* dst) {
+  int j;
+  for (j = 0; j < 16; ++j) {
+    memset(dst + j * BPS, v, 16);
+  }
+}
+
+static void DC16(uint8_t* dst) {    // DC
+  int DC = 16;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    DC += dst[-1 + j * BPS] + dst[j - BPS];
+  }
+  Put16(DC >> 5, dst);
+}
+
+static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
+  int DC = 8;
+  int j;
+  for (j = 0; j < 16; ++j) {
+    DC += dst[-1 + j * BPS];
+  }
+  Put16(DC >> 4, dst);
+}
+
+static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
+  int DC = 8;
+  int i;
+  for (i = 0; i < 16; ++i) {
+    DC += dst[i - BPS];
+  }
+  Put16(DC >> 4, dst);
+}
+
+static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
+  Put16(0x80, dst);
+}
+
+VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
+
+//------------------------------------------------------------------------------
+// 4x4
+
+#define AVG3(a, b, c) ((((uint32_t)a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static void VE4(uint8_t* dst) {    // vertical
+  const uint8_t* top = dst - BPS;
+  const uint8_t vals[4] = {
+    (uint8_t)AVG3(top[-1], top[0], top[1]),
+    (uint8_t)AVG3(top[ 0], top[1], top[2]),
+    (uint8_t)AVG3(top[ 1], top[2], top[3]),
+    (uint8_t)AVG3(top[ 2], top[3], top[4])
+  };
+  int i;
+  for (i = 0; i < 4; ++i) {
+    memcpy(dst + i * BPS, vals, sizeof(vals));
+  }
+}
+
+static void HE4(uint8_t* dst) {    // horizontal
+  const int A = dst[-1 - BPS];
+  const int B = dst[-1];
+  const int C = dst[-1 + BPS];
+  const int D = dst[-1 + 2 * BPS];
+  const int E = dst[-1 + 3 * BPS];
+  *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(A, B, C);
+  *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(B, C, D);
+  *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(C, D, E);
+  *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(D, E, E);
+}
+
+static void DC4(uint8_t* dst) {   // DC
+  uint32_t dc = 4;
+  int i;
+  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+  dc >>= 3;
+  for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
+}
+
+static void RD4(uint8_t* dst) {   // Down-right
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int L = dst[-1 + 3 * BPS];
+  const int X = dst[-1 - BPS];
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  DST(0, 3)                                     = AVG3(J, K, L);
+  DST(1, 3) = DST(0, 2)                         = AVG3(I, J, K);
+  DST(2, 3) = DST(1, 2) = DST(0, 1)             = AVG3(X, I, J);
+  DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+              DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+                          DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+                                      DST(3, 0) = AVG3(D, C, B);
+}
+
+static void LD4(uint8_t* dst) {   // Down-Left
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  const int E = dst[4 - BPS];
+  const int F = dst[5 - BPS];
+  const int G = dst[6 - BPS];
+  const int H = dst[7 - BPS];
+  DST(0, 0)                                     = AVG3(A, B, C);
+  DST(1, 0) = DST(0, 1)                         = AVG3(B, C, D);
+  DST(2, 0) = DST(1, 1) = DST(0, 2)             = AVG3(C, D, E);
+  DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
+              DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+                          DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+                                      DST(3, 3) = AVG3(G, H, H);
+}
+
+static void VR4(uint8_t* dst) {   // Vertical-Right
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int X = dst[-1 - BPS];
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  DST(0, 0) = DST(1, 2) = AVG2(X, A);
+  DST(1, 0) = DST(2, 2) = AVG2(A, B);
+  DST(2, 0) = DST(3, 2) = AVG2(B, C);
+  DST(3, 0)             = AVG2(C, D);
+
+  DST(0, 3) =             AVG3(K, J, I);
+  DST(0, 2) =             AVG3(J, I, X);
+  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+  DST(3, 1) =             AVG3(B, C, D);
+}
+
+static void VL4(uint8_t* dst) {   // Vertical-Left
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+  const int D = dst[3 - BPS];
+  const int E = dst[4 - BPS];
+  const int F = dst[5 - BPS];
+  const int G = dst[6 - BPS];
+  const int H = dst[7 - BPS];
+  DST(0, 0) =             AVG2(A, B);
+  DST(1, 0) = DST(0, 2) = AVG2(B, C);
+  DST(2, 0) = DST(1, 2) = AVG2(C, D);
+  DST(3, 0) = DST(2, 2) = AVG2(D, E);
+
+  DST(0, 1) =             AVG3(A, B, C);
+  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+              DST(3, 2) = AVG3(E, F, G);
+              DST(3, 3) = AVG3(F, G, H);
+}
+
+static void HU4(uint8_t* dst) {   // Horizontal-Up
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int L = dst[-1 + 3 * BPS];
+  DST(0, 0) =             AVG2(I, J);
+  DST(2, 0) = DST(0, 1) = AVG2(J, K);
+  DST(2, 1) = DST(0, 2) = AVG2(K, L);
+  DST(1, 0) =             AVG3(I, J, K);
+  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+  DST(3, 2) = DST(2, 2) =
+    DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static void HD4(uint8_t* dst) {  // Horizontal-Down
+  const int I = dst[-1 + 0 * BPS];
+  const int J = dst[-1 + 1 * BPS];
+  const int K = dst[-1 + 2 * BPS];
+  const int L = dst[-1 + 3 * BPS];
+  const int X = dst[-1 - BPS];
+  const int A = dst[0 - BPS];
+  const int B = dst[1 - BPS];
+  const int C = dst[2 - BPS];
+
+  DST(0, 0) = DST(2, 1) = AVG2(I, X);
+  DST(0, 1) = DST(2, 2) = AVG2(J, I);
+  DST(0, 2) = DST(2, 3) = AVG2(K, J);
+  DST(0, 3)             = AVG2(L, K);
+
+  DST(3, 0)             = AVG3(A, B, C);
+  DST(2, 0)             = AVG3(X, A, B);
+  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+  DST(1, 3)             = AVG3(L, K, J);
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+VP8PredFunc VP8PredLuma4[NUM_BMODES];
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t* dst) {    // vertical
+  int j;
+  for (j = 0; j < 8; ++j) {
+    memcpy(dst + j * BPS, dst - BPS, 8);
+  }
+}
+
+static void HE8uv(uint8_t* dst) {    // horizontal
+  int j;
+  for (j = 0; j < 8; ++j) {
+    memset(dst, dst[-1], 8);
+    dst += BPS;
+  }
+}
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
+  int j;
+  for (j = 0; j < 8; ++j) {
+    memset(dst + j * BPS, value, 8);
+  }
+}
+
+static void DC8uv(uint8_t* dst) {     // DC
+  int dc0 = 8;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[i - BPS] + dst[-1 + i * BPS];
+  }
+  Put8x8uv(dc0 >> 4, dst);
+}
+
+static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
+  int dc0 = 4;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[i - BPS];
+  }
+  Put8x8uv(dc0 >> 3, dst);
+}
+
+static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
+  int dc0 = 4;
+  int i;
+  for (i = 0; i < 8; ++i) {
+    dc0 += dst[-1 + i * BPS];
+  }
+  Put8x8uv(dc0 >> 3, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
+  Put8x8uv(0x80, dst);
+}
+
+VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
+
+//------------------------------------------------------------------------------
+// Edge filtering functions
+
+// 4 pixels in, 2 pixels out
+static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];  // in [-893,892]
+  const int a1 = VP8ksclip2[(a + 4) >> 3];            // in [-16,15]
+  const int a2 = VP8ksclip2[(a + 3) >> 3];
+  p[-step] = VP8kclip1[p0 + a2];
+  p[    0] = VP8kclip1[q0 - a1];
+}
+
+// 4 pixels in, 4 pixels out
+static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  const int a = 3 * (q0 - p0);
+  const int a1 = VP8ksclip2[(a + 4) >> 3];
+  const int a2 = VP8ksclip2[(a + 3) >> 3];
+  const int a3 = (a1 + 1) >> 1;
+  p[-2*step] = VP8kclip1[p1 + a3];
+  p[-  step] = VP8kclip1[p0 + a2];
+  p[      0] = VP8kclip1[q0 - a1];
+  p[   step] = VP8kclip1[q1 - a3];
+}
+
+// 6 pixels in, 6 pixels out
+static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
+  const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
+  const int q0 = p[0], q1 = p[step], q2 = p[2*step];
+  const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+  // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
+  const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
+  const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
+  const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
+  p[-3*step] = VP8kclip1[p2 + a3];
+  p[-2*step] = VP8kclip1[p1 + a2];
+  p[-  step] = VP8kclip1[p0 + a1];
+  p[      0] = VP8kclip1[q0 - a1];
+  p[   step] = VP8kclip1[q1 - a2];
+  p[ 2*step] = VP8kclip1[q2 - a3];
+}
+
+static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
+  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
+}
+
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
+  const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
+  return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
+}
+
+static WEBP_INLINE int needs_filter2(const uint8_t* p,
+                                     int step, int t, int it) {
+  const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
+  const int p0 = p[-step], q0 = p[0];
+  const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
+  if ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) > t) return 0;
+  return VP8kabs0[p3 - p2] <= it && VP8kabs0[p2 - p1] <= it &&
+         VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
+         VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
+}
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  const int thresh2 = 2 * thresh + 1;
+  for (i = 0; i < 16; ++i) {
+    if (needs_filter(p + i, stride, thresh2)) {
+      do_filter2(p + i, stride);
+    }
+  }
+}
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+  int i;
+  const int thresh2 = 2 * thresh + 1;
+  for (i = 0; i < 16; ++i) {
+    if (needs_filter(p + i * stride, 1, thresh2)) {
+      do_filter2(p + i * stride, 1);
+    }
+  }
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    SimpleVFilter16(p, stride, thresh);
+  }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    SimpleHFilter16(p, stride, thresh);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Complex In-loop filtering (Paragraph 15.3)
+
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
+  while (size-- > 0) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
+      } else {
+        do_filter6(p, hstride);
+      }
+    }
+    p += vstride;
+  }
+}
+
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+                                     int hstride, int vstride, int size,
+                                     int thresh, int ithresh, int hev_thresh) {
+  const int thresh2 = 2 * thresh + 1;
+  while (size-- > 0) {
+    if (needs_filter2(p, hstride, thresh2, ithresh)) {
+      if (hev(p, hstride, hev_thresh)) {
+        do_filter2(p, hstride);
+      } else {
+        do_filter4(p, hstride);
+      }
+    }
+    p += vstride;
+  }
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4 * stride;
+    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+                       int thresh, int ithresh, int hev_thresh) {
+  int k;
+  for (k = 3; k > 0; --k) {
+    p += 4;
+    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+  }
+}
+
+// 8-pixels wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+                     int thresh, int ithresh, int hev_thresh) {
+  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+                      int thresh, int ithresh, int hev_thresh) {
+  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+//------------------------------------------------------------------------------
+
+VP8DecIdct2 VP8Transform;
+VP8DecIdct VP8TransformAC3;
+VP8DecIdct VP8TransformUV;
+VP8DecIdct VP8TransformDC;
+VP8DecIdct VP8TransformDCUV;
+
+VP8LumaFilterFunc VP8VFilter16;
+VP8LumaFilterFunc VP8HFilter16;
+VP8ChromaFilterFunc VP8VFilter8;
+VP8ChromaFilterFunc VP8HFilter8;
+VP8LumaFilterFunc VP8VFilter16i;
+VP8LumaFilterFunc VP8HFilter16i;
+VP8ChromaFilterFunc VP8VFilter8i;
+VP8ChromaFilterFunc VP8HFilter8i;
+VP8SimpleFilterFunc VP8SimpleVFilter16;
+VP8SimpleFilterFunc VP8SimpleHFilter16;
+VP8SimpleFilterFunc VP8SimpleVFilter16i;
+VP8SimpleFilterFunc VP8SimpleHFilter16i;
+
+extern void VP8DspInitSSE2(void);
+extern void VP8DspInitSSE41(void);
+extern void VP8DspInitNEON(void);
+extern void VP8DspInitMIPS32(void);
+extern void VP8DspInitMIPSdspR2(void);
+
+static volatile VP8CPUInfo dec_last_cpuinfo_used =
+    (VP8CPUInfo)&dec_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
+  if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  VP8InitClipTables();
+
+  VP8TransformWHT = TransformWHT;
+  VP8Transform = TransformTwo;
+  VP8TransformUV = TransformUV;
+  VP8TransformDC = TransformDC;
+  VP8TransformDCUV = TransformDCUV;
+  VP8TransformAC3 = TransformAC3;
+
+  VP8VFilter16 = VFilter16;
+  VP8HFilter16 = HFilter16;
+  VP8VFilter8 = VFilter8;
+  VP8HFilter8 = HFilter8;
+  VP8VFilter16i = VFilter16i;
+  VP8HFilter16i = HFilter16i;
+  VP8VFilter8i = VFilter8i;
+  VP8HFilter8i = HFilter8i;
+  VP8SimpleVFilter16 = SimpleVFilter16;
+  VP8SimpleHFilter16 = SimpleHFilter16;
+  VP8SimpleVFilter16i = SimpleVFilter16i;
+  VP8SimpleHFilter16i = SimpleHFilter16i;
+
+  VP8PredLuma4[0] = DC4;
+  VP8PredLuma4[1] = TM4;
+  VP8PredLuma4[2] = VE4;
+  VP8PredLuma4[3] = HE4;
+  VP8PredLuma4[4] = RD4;
+  VP8PredLuma4[5] = VR4;
+  VP8PredLuma4[6] = LD4;
+  VP8PredLuma4[7] = VL4;
+  VP8PredLuma4[8] = HD4;
+  VP8PredLuma4[9] = HU4;
+
+  VP8PredLuma16[0] = DC16;
+  VP8PredLuma16[1] = TM16;
+  VP8PredLuma16[2] = VE16;
+  VP8PredLuma16[3] = HE16;
+  VP8PredLuma16[4] = DC16NoTop;
+  VP8PredLuma16[5] = DC16NoLeft;
+  VP8PredLuma16[6] = DC16NoTopLeft;
+
+  VP8PredChroma8[0] = DC8uv;
+  VP8PredChroma8[1] = TM8uv;
+  VP8PredChroma8[2] = VE8uv;
+  VP8PredChroma8[3] = HE8uv;
+  VP8PredChroma8[4] = DC8uvNoTop;
+  VP8PredChroma8[5] = DC8uvNoLeft;
+  VP8PredChroma8[6] = DC8uvNoTopLeft;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8DspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+      if (VP8GetCPUInfo(kSSE4_1)) {
+        VP8DspInitSSE41();
+      }
+#endif
+    }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8DspInitNEON();
+    }
+#endif
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8DspInitMIPS32();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8DspInitMIPSdspR2();
+    }
+#endif
+  }
+  dec_last_cpuinfo_used = VP8GetCPUInfo;
+}
diff --git a/src/loaders/webp/dsp/dec_clip_tables.cpp b/src/loaders/webp/dsp/dec_clip_tables.cpp
new file mode 100644
index 00000000..edfffd0f
--- /dev/null
+++ b/src/loaders/webp/dsp/dec_clip_tables.cpp
@@ -0,0 +1,366 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Clipping tables for filtering
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+//#define USE_STATIC_TABLES     // undefine to have run-time table initialization
+
+#ifdef USE_STATIC_TABLES
+
+static const uint8_t abs0[255 + 255 + 1] = {
+  (uint8_t)0xff, (uint8_t)0xfe, (uint8_t)0xfd, (uint8_t)0xfc, (uint8_t)0xfb, (uint8_t)0xfa, (uint8_t)0xf9, (uint8_t)0xf8, (uint8_t)0xf7, (uint8_t)0xf6, (uint8_t)0xf5, (uint8_t)0xf4,
+  (uint8_t)0xf3, (uint8_t)0xf2, (uint8_t)0xf1, (uint8_t)0xf0, (uint8_t)0xef, (uint8_t)0xee, (uint8_t)0xed, (uint8_t)0xec, (uint8_t)0xeb, (uint8_t)0xea, (uint8_t)0xe9, (uint8_t)0xe8,
+  (uint8_t)0xe7, (uint8_t)0xe6, (uint8_t)0xe5, (uint8_t)0xe4, (uint8_t)0xe3, (uint8_t)0xe2, (uint8_t)0xe1, (uint8_t)0xe0, (uint8_t)0xdf, (uint8_t)0xde, (uint8_t)0xdd, (uint8_t)0xdc,
+  (uint8_t)0xdb, (uint8_t)0xda, (uint8_t)0xd9, (uint8_t)0xd8, (uint8_t)0xd7, (uint8_t)0xd6, (uint8_t)0xd5, (uint8_t)0xd4, (uint8_t)0xd3, (uint8_t)0xd2, (uint8_t)0xd1, (uint8_t)0xd0,
+  (uint8_t)0xcf, (uint8_t)0xce, (uint8_t)0xcd, (uint8_t)0xcc, (uint8_t)0xcb, (uint8_t)0xca, (uint8_t)0xc9, (uint8_t)0xc8, (uint8_t)0xc7, (uint8_t)0xc6, (uint8_t)0xc5, (uint8_t)0xc4,
+  (uint8_t)0xc3, (uint8_t)0xc2, (uint8_t)0xc1, (uint8_t)0xc0, (uint8_t)0xbf, (uint8_t)0xbe, (uint8_t)0xbd, (uint8_t)0xbc, (uint8_t)0xbb, (uint8_t)0xba, (uint8_t)0xb9, (uint8_t)0xb8,
+  (uint8_t)0xb7, (uint8_t)0xb6, (uint8_t)0xb5, (uint8_t)0xb4, (uint8_t)0xb3, (uint8_t)0xb2, (uint8_t)0xb1, (uint8_t)0xb0, (uint8_t)0xaf, (uint8_t)0xae, (uint8_t)0xad, (uint8_t)0xac,
+  (uint8_t)0xab, (uint8_t)0xaa, (uint8_t)0xa9, (uint8_t)0xa8, (uint8_t)0xa7, (uint8_t)0xa6, (uint8_t)0xa5, (uint8_t)0xa4, (uint8_t)0xa3, (uint8_t)0xa2, (uint8_t)0xa1, (uint8_t)0xa0,
+  (uint8_t)0x9f, (uint8_t)0x9e, (uint8_t)0x9d, (uint8_t)0x9c, (uint8_t)0x9b, (uint8_t)0x9a, (uint8_t)0x99, (uint8_t)0x98, (uint8_t)0x97, (uint8_t)0x96, (uint8_t)0x95, (uint8_t)0x94,
+  (uint8_t)0x93, (uint8_t)0x92, (uint8_t)0x91, (uint8_t)0x90, (uint8_t)0x8f, (uint8_t)0x8e, (uint8_t)0x8d, (uint8_t)0x8c, (uint8_t)0x8b, (uint8_t)0x8a, (uint8_t)0x89, (uint8_t)0x88,
+  (uint8_t)0x87, (uint8_t)0x86, (uint8_t)0x85, (uint8_t)0x84, (uint8_t)0x83, (uint8_t)0x82, (uint8_t)0x81, (uint8_t)0x80, (uint8_t)0x7f, (uint8_t)0x7e, (uint8_t)0x7d, (uint8_t)0x7c,
+  (uint8_t)0x7b, (uint8_t)0x7a, (uint8_t)0x79, (uint8_t)0x78, (uint8_t)0x77, (uint8_t)0x76, (uint8_t)0x75, (uint8_t)0x74, (uint8_t)0x73, (uint8_t)0x72, (uint8_t)0x71, (uint8_t)0x70,
+  (uint8_t)0x6f, (uint8_t)0x6e, (uint8_t)0x6d, (uint8_t)0x6c, (uint8_t)0x6b, (uint8_t)0x6a, (uint8_t)0x69, (uint8_t)0x68, (uint8_t)0x67, (uint8_t)0x66, (uint8_t)0x65, (uint8_t)0x64,
+  (uint8_t)0x63, (uint8_t)0x62, (uint8_t)0x61, (uint8_t)0x60, (uint8_t)0x5f, (uint8_t)0x5e, (uint8_t)0x5d, (uint8_t)0x5c, (uint8_t)0x5b, (uint8_t)0x5a, (uint8_t)0x59, (uint8_t)0x58,
+  (uint8_t)0x57, (uint8_t)0x56, (uint8_t)0x55, (uint8_t)0x54, (uint8_t)0x53, (uint8_t)0x52, (uint8_t)0x51, (uint8_t)0x50, (uint8_t)0x4f, (uint8_t)0x4e, (uint8_t)0x4d, (uint8_t)0x4c,
+  (uint8_t)0x4b, (uint8_t)0x4a, (uint8_t)0x49, (uint8_t)0x48, (uint8_t)0x47, (uint8_t)0x46, (uint8_t)0x45, (uint8_t)0x44, (uint8_t)0x43, (uint8_t)0x42, (uint8_t)0x41, (uint8_t)0x40,
+  (uint8_t)0x3f, (uint8_t)0x3e, (uint8_t)0x3d, (uint8_t)0x3c, (uint8_t)0x3b, (uint8_t)0x3a, (uint8_t)0x39, (uint8_t)0x38, (uint8_t)0x37, (uint8_t)0x36, (uint8_t)0x35, (uint8_t)0x34,
+  (uint8_t)0x33, (uint8_t)0x32, (uint8_t)0x31, (uint8_t)0x30, (uint8_t)0x2f, (uint8_t)0x2e, (uint8_t)0x2d, (uint8_t)0x2c, (uint8_t)0x2b, (uint8_t)0x2a, (uint8_t)0x29, (uint8_t)0x28,
+  (uint8_t)0x27, (uint8_t)0x26, (uint8_t)0x25, (uint8_t)0x24, (uint8_t)0x23, (uint8_t)0x22, (uint8_t)0x21, (uint8_t)0x20, (uint8_t)0x1f, (uint8_t)0x1e, (uint8_t)0x1d, (uint8_t)0x1c,
+  (uint8_t)0x1b, (uint8_t)0x1a, (uint8_t)0x19, (uint8_t)0x18, (uint8_t)0x17, (uint8_t)0x16, (uint8_t)0x15, (uint8_t)0x14, (uint8_t)0x13, (uint8_t)0x12, (uint8_t)0x11, (uint8_t)0x10,
+  (uint8_t)0x0f, (uint8_t)0x0e, (uint8_t)0x0d, (uint8_t)0x0c, (uint8_t)0x0b, (uint8_t)0x0a, (uint8_t)0x09, (uint8_t)0x08, (uint8_t)0x07, (uint8_t)0x06, (uint8_t)0x05, (uint8_t)0x04,
+  (uint8_t)0x03, (uint8_t)0x02, (uint8_t)0x01, (uint8_t)0x00, (uint8_t)0x01, (uint8_t)0x02, (uint8_t)0x03, (uint8_t)0x04, (uint8_t)0x05, (uint8_t)0x06, (uint8_t)0x07, (uint8_t)0x08,
+  (uint8_t)0x09, (uint8_t)0x0a, (uint8_t)0x0b, (uint8_t)0x0c, (uint8_t)0x0d, (uint8_t)0x0e, (uint8_t)0x0f, (uint8_t)0x10, (uint8_t)0x11, (uint8_t)0x12, (uint8_t)0x13, (uint8_t)0x14,
+  (uint8_t)0x15, (uint8_t)0x16, (uint8_t)0x17, (uint8_t)0x18, (uint8_t)0x19, (uint8_t)0x1a, (uint8_t)0x1b, (uint8_t)0x1c, (uint8_t)0x1d, (uint8_t)0x1e, (uint8_t)0x1f, (uint8_t)0x20,
+  (uint8_t)0x21, (uint8_t)0x22, (uint8_t)0x23, (uint8_t)0x24, (uint8_t)0x25, (uint8_t)0x26, (uint8_t)0x27, (uint8_t)0x28, (uint8_t)0x29, (uint8_t)0x2a, (uint8_t)0x2b, (uint8_t)0x2c,
+  (uint8_t)0x2d, (uint8_t)0x2e, (uint8_t)0x2f, (uint8_t)0x30, (uint8_t)0x31, (uint8_t)0x32, (uint8_t)0x33, (uint8_t)0x34, (uint8_t)0x35, (uint8_t)0x36, (uint8_t)0x37, (uint8_t)0x38,
+  (uint8_t)0x39, (uint8_t)0x3a, (uint8_t)0x3b, (uint8_t)0x3c, (uint8_t)0x3d, (uint8_t)0x3e, (uint8_t)0x3f, (uint8_t)0x40, (uint8_t)0x41, (uint8_t)0x42, (uint8_t)0x43, (uint8_t)0x44,
+  (uint8_t)0x45, (uint8_t)0x46, (uint8_t)0x47, (uint8_t)0x48, (uint8_t)0x49, (uint8_t)0x4a, (uint8_t)0x4b, (uint8_t)0x4c, (uint8_t)0x4d, (uint8_t)0x4e, (uint8_t)0x4f, (uint8_t)0x50,
+  (uint8_t)0x51, (uint8_t)0x52, (uint8_t)0x53, (uint8_t)0x54, (uint8_t)0x55, (uint8_t)0x56, (uint8_t)0x57, (uint8_t)0x58, (uint8_t)0x59, (uint8_t)0x5a, (uint8_t)0x5b, (uint8_t)0x5c,
+  (uint8_t)0x5d, (uint8_t)0x5e, (uint8_t)0x5f, (uint8_t)0x60, (uint8_t)0x61, (uint8_t)0x62, (uint8_t)0x63, (uint8_t)0x64, (uint8_t)0x65, (uint8_t)0x66, (uint8_t)0x67, (uint8_t)0x68,
+  (uint8_t)0x69, (uint8_t)0x6a, (uint8_t)0x6b, (uint8_t)0x6c, (uint8_t)0x6d, (uint8_t)0x6e, (uint8_t)0x6f, (uint8_t)0x70, (uint8_t)0x71, (uint8_t)0x72, (uint8_t)0x73, (uint8_t)0x74,
+  (uint8_t)0x75, (uint8_t)0x76, (uint8_t)0x77, (uint8_t)0x78, (uint8_t)0x79, (uint8_t)0x7a, (uint8_t)0x7b, (uint8_t)0x7c, (uint8_t)0x7d, (uint8_t)0x7e, (uint8_t)0x7f, (uint8_t)0x80,
+  (uint8_t)0x81, (uint8_t)0x82, (uint8_t)0x83, (uint8_t)0x84, (uint8_t)0x85, (uint8_t)0x86, (uint8_t)0x87, (uint8_t)0x88, (uint8_t)0x89, (uint8_t)0x8a, (uint8_t)0x8b, (uint8_t)0x8c,
+  (uint8_t)0x8d, (uint8_t)0x8e, (uint8_t)0x8f, (uint8_t)0x90, (uint8_t)0x91, (uint8_t)0x92, (uint8_t)0x93, (uint8_t)0x94, (uint8_t)0x95, (uint8_t)0x96, (uint8_t)0x97, (uint8_t)0x98,
+  (uint8_t)0x99, (uint8_t)0x9a, (uint8_t)0x9b, (uint8_t)0x9c, (uint8_t)0x9d, (uint8_t)0x9e, (uint8_t)0x9f, (uint8_t)0xa0, (uint8_t)0xa1, (uint8_t)0xa2, (uint8_t)0xa3, (uint8_t)0xa4,
+  (uint8_t)0xa5, (uint8_t)0xa6, (uint8_t)0xa7, (uint8_t)0xa8, (uint8_t)0xa9, (uint8_t)0xaa, (uint8_t)0xab, (uint8_t)0xac, (uint8_t)0xad, (uint8_t)0xae, (uint8_t)0xaf, (uint8_t)0xb0,
+  (uint8_t)0xb1, (uint8_t)0xb2, (uint8_t)0xb3, (uint8_t)0xb4, (uint8_t)0xb5, (uint8_t)0xb6, (uint8_t)0xb7, (uint8_t)0xb8, (uint8_t)0xb9, (uint8_t)0xba, (uint8_t)0xbb, (uint8_t)0xbc,
+  (uint8_t)0xbd, (uint8_t)0xbe, (uint8_t)0xbf, (uint8_t)0xc0, (uint8_t)0xc1, (uint8_t)0xc2, (uint8_t)0xc3, (uint8_t)0xc4, (uint8_t)0xc5, (uint8_t)0xc6, (uint8_t)0xc7, (uint8_t)0xc8,
+  (uint8_t)0xc9, (uint8_t)0xca, (uint8_t)0xcb, (uint8_t)0xcc, (uint8_t)0xcd, (uint8_t)0xce, (uint8_t)0xcf, (uint8_t)0xd0, (uint8_t)0xd1, (uint8_t)0xd2, (uint8_t)0xd3, (uint8_t)0xd4,
+  (uint8_t)0xd5, (uint8_t)0xd6, (uint8_t)0xd7, (uint8_t)0xd8, (uint8_t)0xd9, (uint8_t)0xda, (uint8_t)0xdb, (uint8_t)0xdc, (uint8_t)0xdd, (uint8_t)0xde, (uint8_t)0xdf, (uint8_t)0xe0,
+  (uint8_t)0xe1, (uint8_t)0xe2, (uint8_t)0xe3, (uint8_t)0xe4, (uint8_t)0xe5, (uint8_t)0xe6, (uint8_t)0xe7, (uint8_t)0xe8, (uint8_t)0xe9, (uint8_t)0xea, (uint8_t)0xeb, (uint8_t)0xec,
+  (uint8_t)0xed, (uint8_t)0xee, (uint8_t)0xef, (uint8_t)0xf0, (uint8_t)0xf1, (uint8_t)0xf2, (uint8_t)0xf3, (uint8_t)0xf4, (uint8_t)0xf5, (uint8_t)0xf6, (uint8_t)0xf7, (uint8_t)0xf8,
+  (uint8_t)0xf9, (uint8_t)0xfa, (uint8_t)0xfb, (uint8_t)0xfc, (uint8_t)0xfd, (uint8_t)0xfe, (uint8_t)0xff
+};
+
+static const int8_t sclip1[1020 + 1020 + 1] = {
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80,
+  (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x80, (int8_t)0x81, (int8_t)0x82, (int8_t)0x83, (int8_t)0x84, (int8_t)0x85, (int8_t)0x86, (int8_t)0x87,
+  (int8_t)0x88, (int8_t)0x89, (int8_t)0x8a, (int8_t)0x8b, (int8_t)0x8c, (int8_t)0x8d, (int8_t)0x8e, (int8_t)0x8f, (int8_t)0x90, (int8_t)0x91, (int8_t)0x92, (int8_t)0x93,
+  (int8_t)0x94, (int8_t)0x95, (int8_t)0x96, (int8_t)0x97, (int8_t)0x98, (int8_t)0x99, (int8_t)0x9a, (int8_t)0x9b, (int8_t)0x9c, (int8_t)0x9d, (int8_t)0x9e, (int8_t)0x9f,
+  (int8_t)0xa0, (int8_t)0xa1, (int8_t)0xa2, (int8_t)0xa3, (int8_t)0xa4, (int8_t)0xa5, (int8_t)0xa6, (int8_t)0xa7, (int8_t)0xa8, (int8_t)0xa9, (int8_t)0xaa, (int8_t)0xab,
+  (int8_t)0xac, (int8_t)0xad, (int8_t)0xae, (int8_t)0xaf, (int8_t)0xb0, (int8_t)0xb1, (int8_t)0xb2, (int8_t)0xb3, (int8_t)0xb4, (int8_t)0xb5, (int8_t)0xb6, (int8_t)0xb7,
+  (int8_t)0xb8, (int8_t)0xb9, (int8_t)0xba, (int8_t)0xbb, (int8_t)0xbc, (int8_t)0xbd, (int8_t)0xbe, (int8_t)0xbf, (int8_t)0xc0, (int8_t)0xc1, (int8_t)0xc2, (int8_t)0xc3,
+  (int8_t)0xc4, (int8_t)0xc5, (int8_t)0xc6, (int8_t)0xc7, (int8_t)0xc8, (int8_t)0xc9, (int8_t)0xca, (int8_t)0xcb, (int8_t)0xcc, (int8_t)0xcd, (int8_t)0xce, (int8_t)0xcf,
+  (int8_t)0xd0, (int8_t)0xd1, (int8_t)0xd2, (int8_t)0xd3, (int8_t)0xd4, (int8_t)0xd5, (int8_t)0xd6, (int8_t)0xd7, (int8_t)0xd8, (int8_t)0xd9, (int8_t)0xda, (int8_t)0xdb,
+  (int8_t)0xdc, (int8_t)0xdd, (int8_t)0xde, (int8_t)0xdf, (int8_t)0xe0, (int8_t)0xe1, (int8_t)0xe2, (int8_t)0xe3, (int8_t)0xe4, (int8_t)0xe5, (int8_t)0xe6, (int8_t)0xe7,
+  (int8_t)0xe8, (int8_t)0xe9, (int8_t)0xea, (int8_t)0xeb, (int8_t)0xec, (int8_t)0xed, (int8_t)0xee, (int8_t)0xef, (int8_t)0xf0, (int8_t)0xf1, (int8_t)0xf2, (int8_t)0xf3,
+  (int8_t)0xf4, (int8_t)0xf5, (int8_t)0xf6, (int8_t)0xf7, (int8_t)0xf8, (int8_t)0xf9, (int8_t)0xfa, (int8_t)0xfb, (int8_t)0xfc, (int8_t)0xfd, (int8_t)0xfe, (int8_t)0xff,
+  (int8_t)0x00, (int8_t)0x01, (int8_t)0x02, (int8_t)0x03, (int8_t)0x04, (int8_t)0x05, (int8_t)0x06, (int8_t)0x07, (int8_t)0x08, (int8_t)0x09, (int8_t)0x0a, (int8_t)0x0b,
+  (int8_t)0x0c, (int8_t)0x0d, (int8_t)0x0e, (int8_t)0x0f, (int8_t)0x10, (int8_t)0x11, (int8_t)0x12, (int8_t)0x13, (int8_t)0x14, (int8_t)0x15, (int8_t)0x16, (int8_t)0x17,
+  (int8_t)0x18, (int8_t)0x19, (int8_t)0x1a, (int8_t)0x1b, (int8_t)0x1c, (int8_t)0x1d, (int8_t)0x1e, (int8_t)0x1f, (int8_t)0x20, (int8_t)0x21, (int8_t)0x22, (int8_t)0x23,
+  (int8_t)0x24, (int8_t)0x25, (int8_t)0x26, (int8_t)0x27, (int8_t)0x28, (int8_t)0x29, (int8_t)0x2a, (int8_t)0x2b, (int8_t)0x2c, (int8_t)0x2d, (int8_t)0x2e, (int8_t)0x2f,
+  (int8_t)0x30, (int8_t)0x31, (int8_t)0x32, (int8_t)0x33, (int8_t)0x34, (int8_t)0x35, (int8_t)0x36, (int8_t)0x37, (int8_t)0x38, (int8_t)0x39, (int8_t)0x3a, (int8_t)0x3b,
+  (int8_t)0x3c, (int8_t)0x3d, (int8_t)0x3e, (int8_t)0x3f, (int8_t)0x40, (int8_t)0x41, (int8_t)0x42, (int8_t)0x43, (int8_t)0x44, (int8_t)0x45, (int8_t)0x46, (int8_t)0x47,
+  (int8_t)0x48, (int8_t)0x49, (int8_t)0x4a, (int8_t)0x4b, (int8_t)0x4c, (int8_t)0x4d, (int8_t)0x4e, (int8_t)0x4f, (int8_t)0x50, (int8_t)0x51, (int8_t)0x52, (int8_t)0x53,
+  (int8_t)0x54, (int8_t)0x55, (int8_t)0x56, (int8_t)0x57, (int8_t)0x58, (int8_t)0x59, (int8_t)0x5a, (int8_t)0x5b, (int8_t)0x5c, (int8_t)0x5d, (int8_t)0x5e, (int8_t)0x5f,
+  (int8_t)0x60, (int8_t)0x61, (int8_t)0x62, (int8_t)0x63, (int8_t)0x64, (int8_t)0x65, (int8_t)0x66, (int8_t)0x67, (int8_t)0x68, (int8_t)0x69, (int8_t)0x6a, (int8_t)0x6b,
+  (int8_t)0x6c, (int8_t)0x6d, (int8_t)0x6e, (int8_t)0x6f, (int8_t)0x70, (int8_t)0x71, (int8_t)0x72, (int8_t)0x73, (int8_t)0x74, (int8_t)0x75, (int8_t)0x76, (int8_t)0x77,
+  (int8_t)0x78, (int8_t)0x79, (int8_t)0x7a, (int8_t)0x7b, (int8_t)0x7c, (int8_t)0x7d, (int8_t)0x7e, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f,
+  (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f, (int8_t)0x7f
+};
+
+static const int8_t sclip2[112 + 112 + 1] = {
+  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
+  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
+  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
+  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
+  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
+  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
+  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
+  (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0, (int8_t)0xf0,
+  (int8_t)0xf0, (int8_t)0xf1, (int8_t)0xf2, (int8_t)0xf3, (int8_t)0xf4, (int8_t)0xf5, (int8_t)0xf6, (int8_t)0xf7, (int8_t)0xf8, (int8_t)0xf9, (int8_t)0xfa, (int8_t)0xfb,
+  (int8_t)0xfc, (int8_t)0xfd, (int8_t)0xfe, (int8_t)0xff, (int8_t)0x00, (int8_t)0x01, (int8_t)0x02, (int8_t)0x03, (int8_t)0x04, (int8_t)0x05, (int8_t)0x06, (int8_t)0x07,
+  (int8_t)0x08, (int8_t)0x09, (int8_t)0x0a, (int8_t)0x0b, (int8_t)0x0c, (int8_t)0x0d, (int8_t)0x0e, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
+  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
+  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
+  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
+  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
+  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
+  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
+  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f,
+  (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f, (int8_t)0x0f
+};
+
+static const uint8_t clip1[255 + 511 + 1] = {
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00,
+  (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x00, (uint8_t)0x01, (uint8_t)0x02, (uint8_t)0x03, (uint8_t)0x04, (uint8_t)0x05, (uint8_t)0x06, (uint8_t)0x07, (uint8_t)0x08,
+  (uint8_t)0x09, (uint8_t)0x0a, (uint8_t)0x0b, (uint8_t)0x0c, (uint8_t)0x0d, (uint8_t)0x0e, (uint8_t)0x0f, (uint8_t)0x10, (uint8_t)0x11, (uint8_t)0x12, (uint8_t)0x13, (uint8_t)0x14,
+  (uint8_t)0x15, (uint8_t)0x16, (uint8_t)0x17, (uint8_t)0x18, (uint8_t)0x19, (uint8_t)0x1a, (uint8_t)0x1b, (uint8_t)0x1c, (uint8_t)0x1d, (uint8_t)0x1e, (uint8_t)0x1f, (uint8_t)0x20,
+  (uint8_t)0x21, (uint8_t)0x22, (uint8_t)0x23, (uint8_t)0x24, (uint8_t)0x25, (uint8_t)0x26, (uint8_t)0x27, (uint8_t)0x28, (uint8_t)0x29, (uint8_t)0x2a, (uint8_t)0x2b, (uint8_t)0x2c,
+  (uint8_t)0x2d, (uint8_t)0x2e, (uint8_t)0x2f, (uint8_t)0x30, (uint8_t)0x31, (uint8_t)0x32, (uint8_t)0x33, (uint8_t)0x34, (uint8_t)0x35, (uint8_t)0x36, (uint8_t)0x37, (uint8_t)0x38,
+  (uint8_t)0x39, (uint8_t)0x3a, (uint8_t)0x3b, (uint8_t)0x3c, (uint8_t)0x3d, (uint8_t)0x3e, (uint8_t)0x3f, (uint8_t)0x40, (uint8_t)0x41, (uint8_t)0x42, (uint8_t)0x43, (uint8_t)0x44,
+  (uint8_t)0x45, (uint8_t)0x46, (uint8_t)0x47, (uint8_t)0x48, (uint8_t)0x49, (uint8_t)0x4a, (uint8_t)0x4b, (uint8_t)0x4c, (uint8_t)0x4d, (uint8_t)0x4e, (uint8_t)0x4f, (uint8_t)0x50,
+  (uint8_t)0x51, (uint8_t)0x52, (uint8_t)0x53, (uint8_t)0x54, (uint8_t)0x55, (uint8_t)0x56, (uint8_t)0x57, (uint8_t)0x58, (uint8_t)0x59, (uint8_t)0x5a, (uint8_t)0x5b, (uint8_t)0x5c,
+  (uint8_t)0x5d, (uint8_t)0x5e, (uint8_t)0x5f, (uint8_t)0x60, (uint8_t)0x61, (uint8_t)0x62, (uint8_t)0x63, (uint8_t)0x64, (uint8_t)0x65, (uint8_t)0x66, (uint8_t)0x67, (uint8_t)0x68,
+  (uint8_t)0x69, (uint8_t)0x6a, (uint8_t)0x6b, (uint8_t)0x6c, (uint8_t)0x6d, (uint8_t)0x6e, (uint8_t)0x6f, (uint8_t)0x70, (uint8_t)0x71, (uint8_t)0x72, (uint8_t)0x73, (uint8_t)0x74,
+  (uint8_t)0x75, (uint8_t)0x76, (uint8_t)0x77, (uint8_t)0x78, (uint8_t)0x79, (uint8_t)0x7a, (uint8_t)0x7b, (uint8_t)0x7c, (uint8_t)0x7d, (uint8_t)0x7e, (uint8_t)0x7f, (uint8_t)0x80,
+  (uint8_t)0x81, (uint8_t)0x82, (uint8_t)0x83, (uint8_t)0x84, (uint8_t)0x85, (uint8_t)0x86, (uint8_t)0x87, (uint8_t)0x88, (uint8_t)0x89, (uint8_t)0x8a, (uint8_t)0x8b, (uint8_t)0x8c,
+  (uint8_t)0x8d, (uint8_t)0x8e, (uint8_t)0x8f, (uint8_t)0x90, (uint8_t)0x91, (uint8_t)0x92, (uint8_t)0x93, (uint8_t)0x94, (uint8_t)0x95, (uint8_t)0x96, (uint8_t)0x97, (uint8_t)0x98,
+  (uint8_t)0x99, (uint8_t)0x9a, (uint8_t)0x9b, (uint8_t)0x9c, (uint8_t)0x9d, (uint8_t)0x9e, (uint8_t)0x9f, (uint8_t)0xa0, (uint8_t)0xa1, (uint8_t)0xa2, (uint8_t)0xa3, (uint8_t)0xa4,
+  (uint8_t)0xa5, (uint8_t)0xa6, (uint8_t)0xa7, (uint8_t)0xa8, (uint8_t)0xa9, (uint8_t)0xaa, (uint8_t)0xab, (uint8_t)0xac, (uint8_t)0xad, (uint8_t)0xae, (uint8_t)0xaf, (uint8_t)0xb0,
+  (uint8_t)0xb1, (uint8_t)0xb2, (uint8_t)0xb3, (uint8_t)0xb4, (uint8_t)0xb5, (uint8_t)0xb6, (uint8_t)0xb7, (uint8_t)0xb8, (uint8_t)0xb9, (uint8_t)0xba, (uint8_t)0xbb, (uint8_t)0xbc,
+  (uint8_t)0xbd, (uint8_t)0xbe, (uint8_t)0xbf, (uint8_t)0xc0, (uint8_t)0xc1, (uint8_t)0xc2, (uint8_t)0xc3, (uint8_t)0xc4, (uint8_t)0xc5, (uint8_t)0xc6, (uint8_t)0xc7, (uint8_t)0xc8,
+  (uint8_t)0xc9, (uint8_t)0xca, (uint8_t)0xcb, (uint8_t)0xcc, (uint8_t)0xcd, (uint8_t)0xce, (uint8_t)0xcf, (uint8_t)0xd0, (uint8_t)0xd1, (uint8_t)0xd2, (uint8_t)0xd3, (uint8_t)0xd4,
+  (uint8_t)0xd5, (uint8_t)0xd6, (uint8_t)0xd7, (uint8_t)0xd8, (uint8_t)0xd9, (uint8_t)0xda, (uint8_t)0xdb, (uint8_t)0xdc, (uint8_t)0xdd, (uint8_t)0xde, (uint8_t)0xdf, (uint8_t)0xe0,
+  (uint8_t)0xe1, (uint8_t)0xe2, (uint8_t)0xe3, (uint8_t)0xe4, (uint8_t)0xe5, (uint8_t)0xe6, (uint8_t)0xe7, (uint8_t)0xe8, (uint8_t)0xe9, (uint8_t)0xea, (uint8_t)0xeb, (uint8_t)0xec,
+  (uint8_t)0xed, (uint8_t)0xee, (uint8_t)0xef, (uint8_t)0xf0, (uint8_t)0xf1, (uint8_t)0xf2, (uint8_t)0xf3, (uint8_t)0xf4, (uint8_t)0xf5, (uint8_t)0xf6, (uint8_t)0xf7, (uint8_t)0xf8,
+  (uint8_t)0xf9, (uint8_t)0xfa, (uint8_t)0xfb, (uint8_t)0xfc, (uint8_t)0xfd, (uint8_t)0xfe, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff,
+  (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff, (uint8_t)0xff
+};
+
+#else
+
+// uninitialized tables
+static uint8_t abs0[255 + 255 + 1];
+static int8_t sclip1[1020 + 1020 + 1];
+static int8_t sclip2[112 + 112 + 1];
+static uint8_t clip1[255 + 511 + 1];
+
+// We declare this variable 'volatile' to prevent instruction reordering
+// and make sure it's set to true _last_ (so as to be thread-safe)
+static volatile int tables_ok = 0;
+
+#endif
+
+const int8_t* const VP8ksclip1 = &sclip1[1020];
+const int8_t* const VP8ksclip2 = &sclip2[112];
+const uint8_t* const VP8kclip1 = &clip1[255];
+const uint8_t* const VP8kabs0 = &abs0[255];
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
+#if !defined(USE_STATIC_TABLES)
+  int i;
+  if (!tables_ok) {
+    for (i = -255; i <= 255; ++i) {
+      abs0[255 + i] = (i < 0) ? -i : i;
+    }
+    for (i = -1020; i <= 1020; ++i) {
+      sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
+    }
+    for (i = -112; i <= 112; ++i) {
+      sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
+    }
+    for (i = -255; i <= 255 + 255; ++i) {
+      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
+    }
+    tables_ok = 1;
+  }
+#endif    // USE_STATIC_TABLES
+}
diff --git a/src/loaders/webp/dsp/dsp.h b/src/loaders/webp/dsp/dsp.h
new file mode 100644
index 00000000..4446102a
--- /dev/null
+++ b/src/loaders/webp/dsp/dsp.h
@@ -0,0 +1,319 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//   Speed-critical functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_DSP_H_
+#define WEBP_DSP_DSP_H_
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include "../webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BPS 32   // this is the common stride for enc/dec
+
+//------------------------------------------------------------------------------
+// CPU detection
+
+#if defined(__GNUC__)
+# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+# define LOCAL_GCC_PREREQ(maj, min) \
+    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GCC_VERSION 0
+# define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#ifdef __clang__
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif  // __clang__
+
+#if defined(_MSC_VER) && _MSC_VER > 1310 && \
+    (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
+#endif
+
+// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
+// files without intrinsics, allowing the corresponding Init() to be called.
+// Files containing intrinsics will need to be built targeting the instruction
+// set so should succeed on one of the earlier tests.
+#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
+//#define WEBP_USE_SSE2
+#endif
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#define WEBP_TSAN_IGNORE_FUNCTION
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#undef WEBP_TSAN_IGNORE_FUNCTION
+#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
+#endif
+#endif
+
+typedef enum {
+  kSSE2,
+} CPUFeature;
+// returns true if the CPU supports the feature.
+typedef int (*VP8CPUInfo)(CPUFeature feature);
+WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
+
+//------------------------------------------------------------------------------
+// Init stub generator
+
+// Defines an init function stub to ensure each module exposes a symbol,
+// avoiding a compiler warning.
+#define WEBP_DSP_INIT_STUB(func) \
+  extern void func(void); \
+  WEBP_TSAN_IGNORE_FUNCTION void func(void) {}
+
+//------------------------------------------------------------------------------
+// Decoding
+
+typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
+// when doing two transforms, coeffs is actually int16_t[2][16].
+typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
+extern VP8DecIdct2 VP8Transform;
+extern VP8DecIdct VP8TransformAC3;
+extern VP8DecIdct VP8TransformUV;
+extern VP8DecIdct VP8TransformDC;
+extern VP8DecIdct VP8TransformDCUV;
+
+// *dst is the destination block, with stride BPS. Boundary samples are
+// assumed accessible when needed.
+typedef void (*VP8PredFunc)(uint8_t* dst);
+extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+
+// clipping tables (for filtering)
+extern const int8_t* const VP8ksclip1;  // clips [-1020, 1020] to [-128, 127]
+extern const int8_t* const VP8ksclip2;  // clips [-112, 112] to [-16, 15]
+extern const uint8_t* const VP8kclip1;  // clips [-255,511] to [0,255]
+extern const uint8_t* const VP8kabs0;   // abs(x) for x in [-255,255]
+// must be called first
+void VP8InitClipTables(void);
+
+// simple filter (only for luma)
+typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
+extern VP8SimpleFilterFunc VP8SimpleVFilter16;
+extern VP8SimpleFilterFunc VP8SimpleHFilter16;
+extern VP8SimpleFilterFunc VP8SimpleVFilter16i;  // filter 3 inner edges
+extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
+
+// regular filter (on both macroblock edges and inner edges)
+typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
+                                  int thresh, int ithresh, int hev_t);
+typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
+                                    int thresh, int ithresh, int hev_t);
+// on outer edge
+extern VP8LumaFilterFunc VP8VFilter16;
+extern VP8LumaFilterFunc VP8HFilter16;
+extern VP8ChromaFilterFunc VP8VFilter8;
+extern VP8ChromaFilterFunc VP8HFilter8;
+
+// on inner edge
+extern VP8LumaFilterFunc VP8VFilter16i;   // filtering 3 inner edges altogether
+extern VP8LumaFilterFunc VP8HFilter16i;
+extern VP8ChromaFilterFunc VP8VFilter8i;  // filtering u and v altogether
+extern VP8ChromaFilterFunc VP8HFilter8i;
+
+// must be called before anything using the above
+void VP8DspInit(void);
+
+//------------------------------------------------------------------------------
+// WebP I/O
+
+//#define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
+
+// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
+// bottom_y can be NULL if only one line of output is needed (at top/bottom).
+typedef void (*WebPUpsampleLinePairFunc)(
+    const uint8_t* top_y, const uint8_t* bottom_y,
+    const uint8_t* top_u, const uint8_t* top_v,
+    const uint8_t* cur_u, const uint8_t* cur_v,
+    uint8_t* top_dst, uint8_t* bottom_dst, int len);
+
+#ifdef FANCY_UPSAMPLING
+
+// Fancy upsampling functions to convert YUV to RGB(A) modes
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+#endif    // FANCY_UPSAMPLING
+
+// Per-row point-sampling methods.
+typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
+                                   const uint8_t* u, const uint8_t* v,
+                                   uint8_t* dst, int len);
+// Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
+void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
+                             const uint8_t* u, const uint8_t* v, int uv_stride,
+                             uint8_t* dst, int dst_stride,
+                             int width, int height, WebPSamplerRowFunc func);
+
+// Sampling functions to convert rows of YUV to RGB(A)
+extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
+
+// General function for converting two lines of ARGB or RGBA.
+// 'alpha_is_last' should be true if 0xff000000 is stored in memory as
+// as 0x00, 0x00, 0x00, 0xff (little endian).
+WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
+
+// YUV444->RGB converters
+typedef void (*WebPYUV444Converter)(const uint8_t* y,
+                                    const uint8_t* u, const uint8_t* v,
+                                    uint8_t* dst, int len);
+
+extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+
+// Must be called before using the WebPUpsamplers[] (and for premultiplied
+// colorspaces like rgbA, rgbA4444, etc)
+void WebPInitUpsamplers(void);
+// Must be called before using WebPSamplers[]
+void WebPInitSamplers(void);
+// Must be called before using WebPYUV444Converters[]
+void WebPInitYUV444Converters(void);
+
+//------------------------------------------------------------------------------
+// Rescaler
+
+struct WebPRescaler;
+
+// Import a row of data and save its contribution in the rescaler.
+// 'channel' denotes the channel number to be imported.
+extern void (*WebPRescalerImportRow)(struct WebPRescaler* const wrk,
+                                     const uint8_t* const src, int channel);
+
+// Export one row (starting at x_out position) from rescaler.
+extern void (*WebPRescalerExportRow)(struct WebPRescaler* const wrk, int x_out);
+
+// Plain-C implementation, as fall-back.
+extern void WebPRescalerExportRowC(struct WebPRescaler* const wrk, int x_out);
+
+// Must be called first before using the above.
+void WebPRescalerDspInit(void);
+
+//------------------------------------------------------------------------------
+// Utilities for processing transparent channel.
+
+// Apply alpha pre-multiply on an rgba, bgra or argb plane of size w * h.
+// alpha_first should be 0 for argb, 1 for rgba or bgra (where alpha is last).
+extern void (*WebPApplyAlphaMultiply)(
+    uint8_t* rgba, int alpha_first, int w, int h, int stride);
+
+// Same, buf specifically for RGBA4444 format
+extern void (*WebPApplyAlphaMultiply4444)(
+    uint8_t* rgba4444, int w, int h, int stride);
+
+// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
+// Returns true if alpha[] plane has non-trivial values different from 0xff.
+extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
+                                int width, int height,
+                                uint8_t* dst, int dst_stride);
+
+// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
+// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
+                                        int width, int height,
+                                        uint32_t* dst, int dst_stride);
+
+// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
+// (this is the opposite of WebPDispatchAlpha).
+// Returns true if there's only trivial 0xff alpha values.
+extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
+                               int width, int height,
+                               uint8_t* alpha, int alpha_stride);
+
+// Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
+// Un-Multiply operation transforms x into x * 255 / A.
+
+// Pre-Multiply or Un-Multiply (if 'inverse' is true) argb values in a row.
+extern void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
+
+// Same a WebPMultARGBRow(), but for several rows.
+void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
+                      int inverse);
+
+// Same for a row of single values, with side alpha values.
+extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
+                           int width, int inverse);
+
+// Same a WebPMultRow(), but for several 'num_rows' rows.
+void WebPMultRows(uint8_t* ptr, int stride,
+                  const uint8_t* alpha, int alpha_stride,
+                  int width, int num_rows, int inverse);
+
+// Plain-C versions, used as fallback by some implementations.
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+                  int width, int inverse);
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
+
+// To be called first before using the above.
+void WebPInitAlphaProcessing(void);
+
+// ARGB packing function: a/r/g/b input is rgba or bgra order.
+extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
+                           const uint8_t* g, const uint8_t* b, int len,
+                           uint32_t* out);
+
+// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
+extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+                          int len, int step, uint32_t* out);
+
+
+//------------------------------------------------------------------------------
+// Filter functions
+
+typedef enum {     // Filter types.
+  WEBP_FILTER_NONE = 0,
+  WEBP_FILTER_HORIZONTAL,
+  WEBP_FILTER_VERTICAL,
+  WEBP_FILTER_GRADIENT,
+  WEBP_FILTER_LAST = WEBP_FILTER_GRADIENT + 1,  // end marker
+  WEBP_FILTER_BEST,    // meta-types
+  WEBP_FILTER_FAST
+} WEBP_FILTER_TYPE;
+
+typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
+                               int stride, uint8_t* out);
+typedef void (*WebPUnfilterFunc)(int width, int height, int stride,
+                                 int row, int num_rows, uint8_t* data);
+
+// Filter the given data using the given predictor.
+// 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
+// in raster order.
+// 'stride' is number of bytes per scan line (with possible padding).
+// 'out' should be pre-allocated.
+extern WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
+
+// In-place reconstruct the original data from the given filtered data.
+// The reconstruction will be done for 'num_rows' rows starting from 'row'
+// (assuming rows upto 'row - 1' are already reconstructed).
+extern WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
+
+// To be called first before using the above.
+void VP8FiltersInit(void);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DSP_DSP_H_ */
diff --git a/src/loaders/webp/dsp/filters.cpp b/src/loaders/webp/dsp/filters.cpp
new file mode 100644
index 00000000..5c30f2e4
--- /dev/null
+++ b/src/loaders/webp/dsp/filters.cpp
@@ -0,0 +1,240 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Spatial prediction using various filters
+//
+// Author: Urvang (urvang@google.com)
+
+#include "./dsp.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+# define SANITY_CHECK(in, out)                                                 \
+  assert(in != NULL);                                                          \
+  assert(out != NULL);                                                         \
+  assert(width > 0);                                                           \
+  assert(height > 0);                                                          \
+  assert(stride >= width);                                                     \
+  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
+  (void)height;  // Silence unused warning.
+
+static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
+                                    uint8_t* dst, int length, int inverse) {
+  int i;
+  if (inverse) {
+    for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
+  } else {
+    for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
+  }
+}
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+                                           int width, int height, int stride,
+                                           int row, int num_rows,
+                                           int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Leftmost pixel is the same as input for topmost scanline.
+    out[0] = in[0];
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    // Leftmost pixel is predicted from above.
+    PredictLine(in, preds - stride, out, 1, inverse);
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Very first top-left pixel is copied.
+    out[0] = in[0];
+    // Rest of top scan-line is left-predicted.
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    row = 1;
+    in += stride;
+    out += stride;
+  } else {
+    // We are starting from in-between. Make sure 'preds' points to prev row.
+    preds -= stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    PredictLine(in, preds, out, width, inverse);
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+  const int g = a + b - c;
+  return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
+}
+
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
+  SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  // left prediction for top scan-line
+  if (row == 0) {
+    out[0] = in[0];
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    int w;
+    // leftmost pixel: predict from above.
+    PredictLine(in, preds - stride, out, 1, inverse);
+    for (w = 1; w < width; ++w) {
+      const int pred = GradientPredictor(preds[w - 1],
+                                         preds[w - stride],
+                                         preds[w - stride - 1]);
+      out[w] = in[w] + (inverse ? pred : -pred);
+    }
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+                             int stride, uint8_t* filtered_data) {
+  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void VerticalFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+
+static void GradientFilter(const uint8_t* data, int width, int height,
+                           int stride, uint8_t* filtered_data) {
+  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+
+//------------------------------------------------------------------------------
+
+static void VerticalUnfilter(int width, int height, int stride, int row,
+                             int num_rows, uint8_t* data) {
+  DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+static void HorizontalUnfilter(int width, int height, int stride, int row,
+                               int num_rows, uint8_t* data) {
+  DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+static void GradientUnfilter(int width, int height, int stride, int row,
+                             int num_rows, uint8_t* data) {
+  DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+//------------------------------------------------------------------------------
+// Init function
+
+WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
+WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
+
+extern void VP8FiltersInitMIPSdspR2(void);
+extern void VP8FiltersInitSSE2(void);
+
+static volatile VP8CPUInfo filters_last_cpuinfo_used =
+    (VP8CPUInfo)&filters_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
+  if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  WebPUnfilters[WEBP_FILTER_NONE] = NULL;
+  WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
+  WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
+  WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+
+  WebPFilters[WEBP_FILTER_NONE] = NULL;
+  WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+  WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+  WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8FiltersInitSSE2();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8FiltersInitMIPSdspR2();
+    }
+#endif
+  }
+  filters_last_cpuinfo_used = VP8GetCPUInfo;
+}
diff --git a/src/loaders/webp/dsp/lossless.cpp b/src/loaders/webp/dsp/lossless.cpp
new file mode 100644
index 00000000..5ca77594
--- /dev/null
+++ b/src/loaders/webp/dsp/lossless.cpp
@@ -0,0 +1,636 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+//          Urvang Joshi (urvang@google.com)
+
+#include "./dsp.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include "../dec/vp8li.h"
+#include "../utils/endian_inl.h"
+#include "./lossless.h"
+#include "./yuv.h"
+
+#define MAX_DIFF_COST (1e30f)
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+// In-place sum of each component with mod 256.
+static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
+  const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
+  const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
+  *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+  return (((a0 ^ a1) & 0xfefefefeL) >> 1) + (a0 & a1);
+}
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+  return Average2(Average2(a0, a2), a1);
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+                                     uint32_t a2, uint32_t a3) {
+  return Average2(Average2(a0, a1), Average2(a2, a3));
+}
+
+static WEBP_INLINE uint32_t Clip255(uint32_t a) {
+  if (a < 256) {
+    return a;
+  }
+  // return 0, when a is a negative integer.
+  // return 255, when a is positive.
+  return ~a >> 24;
+}
+
+static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
+  return Clip255(a + b - c);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {
+  const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
+  const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
+                                         (c1 >> 16) & 0xff,
+                                         (c2 >> 16) & 0xff);
+  const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
+                                         (c1 >> 8) & 0xff,
+                                         (c2 >> 8) & 0xff);
+  const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
+  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
+}
+
+static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
+  return Clip255(a + (a - b) / 2);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+                                                   uint32_t c2) {
+  const uint32_t ave = Average2(c0, c1);
+  const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
+  const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
+  const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
+  const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
+  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
+}
+
+// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
+#if defined(__arm__) && LOCAL_GCC_VERSION == 0x409
+# define LOCAL_INLINE __attribute__ ((noinline))
+#else
+# define LOCAL_INLINE WEBP_INLINE
+#endif
+
+static LOCAL_INLINE int Sub3(int a, int b, int c) {
+  const int pb = b - c;
+  const int pa = a - c;
+  return abs(pb) - abs(pa);
+}
+
+#undef LOCAL_INLINE
+
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+  const int pa_minus_pb =
+      Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
+      Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
+      Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
+      Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
+  return (pa_minus_pb <= 0) ? a : b;
+}
+
+//------------------------------------------------------------------------------
+// Predictors
+
+static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
+  (void)top;
+  (void)left;
+  return ARGB_BLACK;
+}
+static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
+  (void)top;
+  return left;
+}
+static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
+  (void)left;
+  return top[0];
+}
+static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
+  (void)left;
+  return top[1];
+}
+static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
+  (void)left;
+  return top[-1];
+}
+static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average3(left, top[0], top[1]);
+  return pred;
+}
+static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(left, top[-1]);
+  return pred;
+}
+static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(left, top[0]);
+  return pred;
+}
+static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(top[-1], top[0]);
+  (void)left;
+  return pred;
+}
+static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average2(top[0], top[1]);
+  (void)left;
+  return pred;
+}
+static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+  return pred;
+}
+static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = Select(top[0], left, top[-1]);
+  return pred;
+}
+static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+  return pred;
+}
+static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+  return pred;
+}
+
+//------------------------------------------------------------------------------
+
+// Inverse prediction.
+static void PredictorInverseTransform(const VP8LTransform* const transform,
+                                      int y_start, int y_end, uint32_t* data) {
+  const int width = transform->xsize_;
+  if (y_start == 0) {  // First Row follows the L (mode=1) mode.
+    int x;
+    const uint32_t pred0 = Predictor0(data[-1], NULL);
+    AddPixelsEq(data, pred0);
+    for (x = 1; x < width; ++x) {
+      const uint32_t pred1 = Predictor1(data[x - 1], NULL);
+      AddPixelsEq(data + x, pred1);
+    }
+    data += width;
+    ++y_start;
+  }
+
+  {
+    int y = y_start;
+    const int tile_width = 1 << transform->bits_;
+    const int mask = tile_width - 1;
+    const int safe_width = width & ~mask;
+    const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
+    const uint32_t* pred_mode_base =
+        transform->data_ + (y >> transform->bits_) * tiles_per_row;
+
+    while (y < y_end) {
+      const uint32_t pred2 = Predictor2(data[-1], data - width);
+      const uint32_t* pred_mode_src = pred_mode_base;
+      VP8LPredictorFunc pred_func;
+      int x = 1;
+      int t = 1;
+      // First pixel follows the T (mode=2) mode.
+      AddPixelsEq(data, pred2);
+      // .. the rest:
+      while (x < safe_width) {
+        pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
+        for (; t < tile_width; ++t, ++x) {
+          const uint32_t pred = pred_func(data[x - 1], data + x - width);
+          AddPixelsEq(data + x, pred);
+        }
+        t = 0;
+      }
+      if (x < width) {
+        pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
+        for (; x < width; ++x) {
+          const uint32_t pred = pred_func(data[x - 1], data + x - width);
+          AddPixelsEq(data + x, pred);
+        }
+      }
+      data += width;
+      ++y;
+      if ((y & mask) == 0) {   // Use the same mask, since tiles are squares.
+        pred_mode_base += tiles_per_row;
+      }
+    }
+  }
+}
+
+// Add green to blue and red channels (i.e. perform the inverse transform of
+// 'subtract green').
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const uint32_t argb = data[i];
+    const uint32_t green = ((argb >> 8) & 0xff);
+    uint32_t red_blue = (argb & 0x00ff00ffu);
+    red_blue += (green << 16) | green;
+    red_blue &= 0x00ff00ffu;
+    data[i] = (argb & 0xff00ff00u) | red_blue;
+  }
+}
+
+static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
+                                                int8_t color) {
+  return (uint32_t)((int)(color_pred) * color) >> 5;
+}
+
+static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
+                                               VP8LMultipliers* const m) {
+  m->green_to_red_  = (color_code >>  0) & 0xff;
+  m->green_to_blue_ = (color_code >>  8) & 0xff;
+  m->red_to_blue_   = (color_code >> 16) & 0xff;
+}
+
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
+                                 int num_pixels) {
+  int i;
+  for (i = 0; i < num_pixels; ++i) {
+    const uint32_t argb = data[i];
+    const uint32_t green = argb >> 8;
+    const uint32_t red = argb >> 16;
+    uint32_t new_red = red;
+    uint32_t new_blue = argb;
+    new_red += ColorTransformDelta(m->green_to_red_, green);
+    new_red &= 0xff;
+    new_blue += ColorTransformDelta(m->green_to_blue_, green);
+    new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
+    new_blue &= 0xff;
+    data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+  }
+}
+
+// Color space inverse transform.
+static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
+                                       int y_start, int y_end, uint32_t* data) {
+  const int width = transform->xsize_;
+  const int tile_width = 1 << transform->bits_;
+  const int mask = tile_width - 1;
+  const int safe_width = width & ~mask;
+  const int remaining_width = width - safe_width;
+  const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
+  int y = y_start;
+  const uint32_t* pred_row =
+      transform->data_ + (y >> transform->bits_) * tiles_per_row;
+
+  while (y < y_end) {
+    const uint32_t* pred = pred_row;
+    VP8LMultipliers m = { 0, 0, 0 };
+    const uint32_t* const data_safe_end = data + safe_width;
+    const uint32_t* const data_end = data + width;
+    while (data < data_safe_end) {
+      ColorCodeToMultipliers(*pred++, &m);
+      VP8LTransformColorInverse(&m, data, tile_width);
+      data += tile_width;
+    }
+    if (data < data_end) {  // Left-overs using C-version.
+      ColorCodeToMultipliers(*pred++, &m);
+      VP8LTransformColorInverse(&m, data, remaining_width);
+      data += remaining_width;
+    }
+    ++y;
+    if ((y & mask) == 0) pred_row += tiles_per_row;
+  }
+}
+
+// Separate out pixels packed together using pixel-bundling.
+// We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
+#define COLOR_INDEX_INVERSE(FUNC_NAME, F_NAME, STATIC_DECL, TYPE, BIT_SUFFIX,  \
+                            GET_INDEX, GET_VALUE)                              \
+static void F_NAME(const TYPE* src, const uint32_t* const color_map,           \
+                   TYPE* dst, int y_start, int y_end, int width) {             \
+  int y;                                                                       \
+  for (y = y_start; y < y_end; ++y) {                                          \
+    int x;                                                                     \
+    for (x = 0; x < width; ++x) {                                              \
+      *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]);                        \
+    }                                                                          \
+  }                                                                            \
+}                                                                              \
+STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform,               \
+                           int y_start, int y_end, const TYPE* src,            \
+                           TYPE* dst) {                                        \
+  int y;                                                                       \
+  const int bits_per_pixel = 8 >> transform->bits_;                            \
+  const int width = transform->xsize_;                                         \
+  const uint32_t* const color_map = transform->data_;                          \
+  if (bits_per_pixel < 8) {                                                    \
+    const int pixels_per_byte = 1 << transform->bits_;                         \
+    const int count_mask = pixels_per_byte - 1;                                \
+    const uint32_t bit_mask = (1 << bits_per_pixel) - 1;                       \
+    for (y = y_start; y < y_end; ++y) {                                        \
+      uint32_t packed_pixels = 0;                                              \
+      int x;                                                                   \
+      for (x = 0; x < width; ++x) {                                            \
+        /* We need to load fresh 'packed_pixels' once every                */  \
+        /* 'pixels_per_byte' increments of x. Fortunately, pixels_per_byte */  \
+        /* is a power of 2, so can just use a mask for that, instead of    */  \
+        /* decrementing a counter.                                         */  \
+        if ((x & count_mask) == 0) packed_pixels = GET_INDEX(*src++);          \
+        *dst++ = GET_VALUE(color_map[packed_pixels & bit_mask]);               \
+        packed_pixels >>= bits_per_pixel;                                      \
+      }                                                                        \
+    }                                                                          \
+  } else {                                                                     \
+    VP8LMapColor##BIT_SUFFIX(src, color_map, dst, y_start, y_end, width);      \
+  }                                                                            \
+}
+
+COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
+                    VP8GetARGBIndex, VP8GetARGBValue)
+COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
+                    8b, VP8GetAlphaIndex, VP8GetAlphaValue)
+
+#undef COLOR_INDEX_INVERSE
+
+void VP8LInverseTransform(const VP8LTransform* const transform,
+                          int row_start, int row_end,
+                          const uint32_t* const in, uint32_t* const out) {
+  const int width = transform->xsize_;
+  assert(row_start < row_end);
+  assert(row_end <= transform->ysize_);
+  switch (transform->type_) {
+    case SUBTRACT_GREEN:
+      VP8LAddGreenToBlueAndRed(out, (row_end - row_start) * width);
+      break;
+    case PREDICTOR_TRANSFORM:
+      PredictorInverseTransform(transform, row_start, row_end, out);
+      if (row_end != transform->ysize_) {
+        // The last predicted row in this iteration will be the top-pred row
+        // for the first row in next iteration.
+        memcpy(out - width, out + (row_end - row_start - 1) * width,
+               width * sizeof(*out));
+      }
+      break;
+    case CROSS_COLOR_TRANSFORM:
+      ColorSpaceInverseTransform(transform, row_start, row_end, out);
+      break;
+    case COLOR_INDEXING_TRANSFORM:
+      if (in == out && transform->bits_ > 0) {
+        // Move packed pixels to the end of unpacked region, so that unpacking
+        // can occur seamlessly.
+        // Also, note that this is the only transform that applies on
+        // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
+        // transforms work on effective width of xsize_.
+        const int out_stride = (row_end - row_start) * width;
+        const int in_stride = (row_end - row_start) *
+            VP8LSubSampleSize(transform->xsize_, transform->bits_);
+        uint32_t* const src = out + out_stride - in_stride;
+        memmove(src, out, in_stride * sizeof(*src));
+        ColorIndexInverseTransform(transform, row_start, row_end, src, out);
+      } else {
+        ColorIndexInverseTransform(transform, row_start, row_end, in, out);
+      }
+      break;
+  }
+}
+
+//------------------------------------------------------------------------------
+// Color space conversion.
+
+static int is_big_endian(void) {
+  static const union {
+    uint16_t w;
+    uint8_t b[2];
+  } tmp = { 1 };
+  return (tmp.b[0] != 1);
+}
+
+void VP8LConvertBGRAToRGB_C(const uint32_t* src,
+                            int num_pixels, uint8_t* dst) {
+  const uint32_t* const src_end = src + num_pixels;
+  while (src < src_end) {
+    const uint32_t argb = *src++;
+    *dst++ = (argb >> 16) & 0xff;
+    *dst++ = (argb >>  8) & 0xff;
+    *dst++ = (argb >>  0) & 0xff;
+  }
+}
+
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
+                             int num_pixels, uint8_t* dst) {
+  const uint32_t* const src_end = src + num_pixels;
+  while (src < src_end) {
+    const uint32_t argb = *src++;
+    *dst++ = (argb >> 16) & 0xff;
+    *dst++ = (argb >>  8) & 0xff;
+    *dst++ = (argb >>  0) & 0xff;
+    *dst++ = (argb >> 24) & 0xff;
+  }
+}
+
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst) {
+  const uint32_t* const src_end = src + num_pixels;
+  while (src < src_end) {
+    const uint32_t argb = *src++;
+    const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
+    const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
+#ifdef WEBP_SWAP_16BIT_CSP
+    *dst++ = ba;
+    *dst++ = rg;
+#else
+    *dst++ = rg;
+    *dst++ = ba;
+#endif
+  }
+}
+
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+                               int num_pixels, uint8_t* dst) {
+  const uint32_t* const src_end = src + num_pixels;
+  while (src < src_end) {
+    const uint32_t argb = *src++;
+    const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
+    const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
+#ifdef WEBP_SWAP_16BIT_CSP
+    *dst++ = gb;
+    *dst++ = rg;
+#else
+    *dst++ = rg;
+    *dst++ = gb;
+#endif
+  }
+}
+
+void VP8LConvertBGRAToBGR_C(const uint32_t* src,
+                            int num_pixels, uint8_t* dst) {
+  const uint32_t* const src_end = src + num_pixels;
+  while (src < src_end) {
+    const uint32_t argb = *src++;
+    *dst++ = (argb >>  0) & 0xff;
+    *dst++ = (argb >>  8) & 0xff;
+    *dst++ = (argb >> 16) & 0xff;
+  }
+}
+
+static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
+                       int swap_on_big_endian) {
+  if (is_big_endian() == swap_on_big_endian) {
+    const uint32_t* const src_end = src + num_pixels;
+    while (src < src_end) {
+      const uint32_t argb = *src++;
+
+#if !defined(WORDS_BIGENDIAN)
+#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
+      *(uint32_t*)dst = BSwap32(argb);
+#else  // WEBP_REFERENCE_IMPLEMENTATION
+      dst[0] = (argb >> 24) & 0xff;
+      dst[1] = (argb >> 16) & 0xff;
+      dst[2] = (argb >>  8) & 0xff;
+      dst[3] = (argb >>  0) & 0xff;
+#endif
+#else  // WORDS_BIGENDIAN
+      dst[0] = (argb >>  0) & 0xff;
+      dst[1] = (argb >>  8) & 0xff;
+      dst[2] = (argb >> 16) & 0xff;
+      dst[3] = (argb >> 24) & 0xff;
+#endif
+      dst += sizeof(argb);
+    }
+  } else {
+    memcpy(dst, src, num_pixels * sizeof(*src));
+  }
+}
+
+void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
+                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
+  switch (out_colorspace) {
+    case MODE_RGB:
+      VP8LConvertBGRAToRGB(in_data, num_pixels, rgba);
+      break;
+    case MODE_RGBA:
+      VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
+      break;
+    case MODE_rgbA:
+      VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
+      WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
+      break;
+    case MODE_BGR:
+      VP8LConvertBGRAToBGR(in_data, num_pixels, rgba);
+      break;
+    case MODE_BGRA:
+      CopyOrSwap(in_data, num_pixels, rgba, 1);
+      break;
+    case MODE_bgrA:
+      CopyOrSwap(in_data, num_pixels, rgba, 1);
+      WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
+      break;
+    case MODE_ARGB:
+      CopyOrSwap(in_data, num_pixels, rgba, 0);
+      break;
+    case MODE_Argb:
+      CopyOrSwap(in_data, num_pixels, rgba, 0);
+      WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
+      break;
+    case MODE_RGBA_4444:
+      VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
+      break;
+    case MODE_rgbA_4444:
+      VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
+      WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
+      break;
+    case MODE_RGB_565:
+      VP8LConvertBGRAToRGB565(in_data, num_pixels, rgba);
+      break;
+    default:
+      assert(0);          // Code flow should not reach here.
+  }
+}
+
+//------------------------------------------------------------------------------
+
+VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LPredictorFunc VP8LPredictors[16];
+
+VP8LTransformColorFunc VP8LTransformColorInverse;
+
+VP8LConvertFunc VP8LConvertBGRAToRGB;
+VP8LConvertFunc VP8LConvertBGRAToRGBA;
+VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
+VP8LConvertFunc VP8LConvertBGRAToRGB565;
+VP8LConvertFunc VP8LConvertBGRAToBGR;
+
+VP8LMapARGBFunc VP8LMapColor32b;
+VP8LMapAlphaFunc VP8LMapColor8b;
+
+extern void VP8LDspInitSSE2(void);
+extern void VP8LDspInitNEON(void);
+extern void VP8LDspInitMIPSdspR2(void);
+
+static volatile VP8CPUInfo lossless_last_cpuinfo_used =
+    (VP8CPUInfo)&lossless_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
+  if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  VP8LPredictors[0] = Predictor0;
+  VP8LPredictors[1] = Predictor1;
+  VP8LPredictors[2] = Predictor2;
+  VP8LPredictors[3] = Predictor3;
+  VP8LPredictors[4] = Predictor4;
+  VP8LPredictors[5] = Predictor5;
+  VP8LPredictors[6] = Predictor6;
+  VP8LPredictors[7] = Predictor7;
+  VP8LPredictors[8] = Predictor8;
+  VP8LPredictors[9] = Predictor9;
+  VP8LPredictors[10] = Predictor10;
+  VP8LPredictors[11] = Predictor11;
+  VP8LPredictors[12] = Predictor12;
+  VP8LPredictors[13] = Predictor13;
+  VP8LPredictors[14] = Predictor0;     // <- padding security sentinels
+  VP8LPredictors[15] = Predictor0;
+
+  VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
+
+  VP8LTransformColorInverse = VP8LTransformColorInverse_C;
+
+  VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
+  VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
+  VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
+  VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
+  VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
+
+  VP8LMapColor32b = MapARGB;
+  VP8LMapColor8b = MapAlpha;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8LDspInitSSE2();
+    }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      VP8LDspInitNEON();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      VP8LDspInitMIPSdspR2();
+    }
+#endif
+  }
+  lossless_last_cpuinfo_used = VP8GetCPUInfo;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/loaders/webp/dsp/lossless.h b/src/loaders/webp/dsp/lossless.h
new file mode 100644
index 00000000..48b360e8
--- /dev/null
+++ b/src/loaders/webp/dsp/lossless.h
@@ -0,0 +1,303 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+
+#ifndef WEBP_DSP_LOSSLESS_H_
+#define WEBP_DSP_LOSSLESS_H_
+
+#include "../webp/types.h"
+#include "../webp/decode.h"
+
+// #include "../enc/histogram.h"
+#include "../utils/utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Not a trivial literal symbol.
+#define VP8L_NON_TRIVIAL_SYM (0xffffffff)
+
+//------------------------------------------------------------------------------
+// Decoding
+
+typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
+extern VP8LPredictorFunc VP8LPredictors[16];
+
+typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
+extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+
+typedef struct {
+  // Note: the members are uint8_t, so that any negative values are
+  // automatically converted to "mod 256" values.
+  uint8_t green_to_red_;
+  uint8_t green_to_blue_;
+  uint8_t red_to_blue_;
+} VP8LMultipliers;
+typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
+                                       uint32_t* argb_data, int num_pixels);
+extern VP8LTransformColorFunc VP8LTransformColorInverse;
+
+struct VP8LTransform;  // Defined in dec/vp8li.h.
+
+// Performs inverse transform of data given transform information, start and end
+// rows. Transform will be applied to rows [row_start, row_end[.
+// The *in and *out pointers refer to source and destination data respectively
+// corresponding to the intermediate row (row_start).
+void VP8LInverseTransform(const struct VP8LTransform* const transform,
+                          int row_start, int row_end,
+                          const uint32_t* const in, uint32_t* const out);
+
+// Color space conversion.
+typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
+                                uint8_t* dst);
+extern VP8LConvertFunc VP8LConvertBGRAToRGB;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
+extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
+extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
+extern VP8LConvertFunc VP8LConvertBGRAToBGR;
+
+// Converts from BGRA to other color spaces.
+void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
+                         WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
+
+// color mapping related functions.
+static WEBP_INLINE uint32_t VP8GetARGBIndex(uint32_t idx) {
+  return (idx >> 8) & 0xff;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaIndex(uint8_t idx) {
+  return idx;
+}
+
+static WEBP_INLINE uint32_t VP8GetARGBValue(uint32_t val) {
+  return val;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaValue(uint32_t val) {
+  return (val >> 8) & 0xff;
+}
+
+typedef void (*VP8LMapARGBFunc)(const uint32_t* src,
+                                const uint32_t* const color_map,
+                                uint32_t* dst, int y_start,
+                                int y_end, int width);
+typedef void (*VP8LMapAlphaFunc)(const uint8_t* src,
+                                 const uint32_t* const color_map,
+                                 uint8_t* dst, int y_start,
+                                 int y_end, int width);
+
+extern VP8LMapARGBFunc VP8LMapColor32b;
+extern VP8LMapAlphaFunc VP8LMapColor8b;
+
+// Similar to the static method ColorIndexInverseTransform() that is part of
+// lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
+// uint32_t) arguments for 'src' and 'dst'.
+void VP8LColorIndexInverseTransformAlpha(
+    const struct VP8LTransform* const transform, int y_start, int y_end,
+    const uint8_t* src, uint8_t* dst);
+
+// Expose some C-only fallback functions
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
+                                 uint32_t* data, int num_pixels);
+
+void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
+                                 int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
+                               int num_pixels, uint8_t* dst);
+void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
+void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
+
+// Must be called before calling any of the above methods.
+void VP8LDspInit(void);
+
+//------------------------------------------------------------------------------
+// Encoding
+
+extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+extern VP8LTransformColorFunc VP8LTransformColor;
+typedef void (*VP8LCollectColorBlueTransformsFunc)(
+    const uint32_t* argb, int stride,
+    int tile_width, int tile_height,
+    int green_to_blue, int red_to_blue, int histo[]);
+extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
+
+typedef void (*VP8LCollectColorRedTransformsFunc)(
+    const uint32_t* argb, int stride,
+    int tile_width, int tile_height,
+    int green_to_red, int histo[]);
+extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
+
+// Expose some C-only fallback functions
+void VP8LTransformColor_C(const VP8LMultipliers* const m,
+                          uint32_t* data, int num_pixels);
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
+
+//------------------------------------------------------------------------------
+// Image transforms.
+
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+                       uint32_t* const argb, uint32_t* const argb_scratch,
+                       uint32_t* const image);
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                             uint32_t* const argb, uint32_t* image);
+
+//------------------------------------------------------------------------------
+// Misc methods.
+
+// Computes sampled size of 'size' when sampling using 'sampling bits'.
+static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
+                                              uint32_t sampling_bits) {
+  return (size + (1 << sampling_bits) - 1) >> sampling_bits;
+}
+
+// -----------------------------------------------------------------------------
+// Faster logarithm for integers. Small values use a look-up table.
+#define LOG_LOOKUP_IDX_MAX 256
+extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
+extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
+typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
+
+extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
+  return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
+}
+// Fast calculation of v * log2(v) for integer input.
+static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
+  return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
+}
+
+// -----------------------------------------------------------------------------
+// Huffman-cost related functions.
+
+typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
+typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
+                                       int length);
+
+extern VP8LCostFunc VP8LExtraCost;
+extern VP8LCostCombinedFunc VP8LExtraCostCombined;
+
+typedef struct {        // small struct to hold counters
+  int counts[2];        // index: 0=zero steak, 1=non-zero streak
+  int streaks[2][2];    // [zero/non-zero][streak<3 / streak>=3]
+} VP8LStreaks;
+
+typedef VP8LStreaks (*VP8LCostCountFunc)(const uint32_t* population,
+                                         int length);
+typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X,
+                                                 const uint32_t* Y, int length);
+
+extern VP8LCostCountFunc VP8LHuffmanCostCount;
+extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
+
+// Get the symbol entropy for the distribution 'population'.
+// Set 'trivial_sym', if there's only one symbol present in the distribution.
+double VP8LPopulationCost(const uint32_t* const population, int length,
+                          uint32_t* const trivial_sym);
+
+// Get the combined symbol entropy for the distributions 'X' and 'Y'.
+double VP8LGetCombinedEntropy(const uint32_t* const X,
+                              const uint32_t* const Y, int length);
+
+// -----------------------------------------------------------------------------
+// PrefixEncode()
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  const int log_floor = BitsLog2Floor(n);
+  if (n == (n & ~(n - 1)))  // zero or a power of two.
+    return log_floor;
+  else
+    return log_floor + 1;
+}
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
+                                                  int* const extra_bits) {
+  const int highest_bit = BitsLog2Floor(--distance);
+  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+  *extra_bits = highest_bit - 1;
+  *code = 2 * highest_bit + second_highest_bit;
+}
+
+static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
+                                              int* const extra_bits,
+                                              int* const extra_bits_value) {
+  const int highest_bit = BitsLog2Floor(--distance);
+  const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+  *extra_bits = highest_bit - 1;
+  *extra_bits_value = distance & ((1 << *extra_bits) - 1);
+  *code = 2 * highest_bit + second_highest_bit;
+}
+
+#define PREFIX_LOOKUP_IDX_MAX   512
+typedef struct {
+  int8_t code_;
+  int8_t extra_bits_;
+} VP8LPrefixCode;
+
+// These tables are derived using VP8LPrefixEncodeNoLUT.
+extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
+extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
+static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
+                                             int* const extra_bits) {
+  if (distance < PREFIX_LOOKUP_IDX_MAX) {
+    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+    *code = prefix_code.code_;
+    *extra_bits = prefix_code.extra_bits_;
+  } else {
+    VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
+  }
+}
+
+static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
+                                         int* const extra_bits,
+                                         int* const extra_bits_value) {
+  if (distance < PREFIX_LOOKUP_IDX_MAX) {
+    const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+    *code = prefix_code.code_;
+    *extra_bits = prefix_code.extra_bits_;
+    *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
+  } else {
+    VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
+  }
+}
+
+// In-place difference of each component with mod 256.
+static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
+  const uint32_t alpha_and_green =
+      0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u);
+  const uint32_t red_and_blue =
+      0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu);
+  return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+void VP8LBundleColorMap(const uint8_t* const row, int width,
+                        int xbits, uint32_t* const dst);
+
+// Must be called before calling any of the above methods.
+void VP8LEncDspInit(void);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DSP_LOSSLESS_H_
diff --git a/src/loaders/webp/dsp/meson.build b/src/loaders/webp/dsp/meson.build
new file mode 100644
index 00000000..9f9eb593
--- /dev/null
+++ b/src/loaders/webp/dsp/meson.build
@@ -0,0 +1,20 @@
+source_file = [
+   'yuv.h',
+   'dsp.h',
+   'lossless.h',
+   'alpha_processing.cpp',
+   'argb.cpp',
+   'cpu.cpp',
+   'dec.cpp',
+   'dec_clip_tables.cpp',
+   'filters.cpp',
+   'lossless.cpp',
+   'rescaler.cpp',
+   'upsampling.cpp',
+   'yuv.cpp'
+]
+
+webp_deb += [declare_dependency(
+   include_directories : include_directories('.'),
+   sources : source_file
+)]
\ No newline at end of file
diff --git a/src/loaders/webp/dsp/rescaler.cpp b/src/loaders/webp/dsp/rescaler.cpp
new file mode 100644
index 00000000..640b462c
--- /dev/null
+++ b/src/loaders/webp/dsp/rescaler.cpp
@@ -0,0 +1,115 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Rescaling functions
+
+#include "./dsp.h"
+#include "../utils/rescaler.h"
+
+//------------------------------------------------------------------------------
+// Implementations of critical functions ImportRow / ExportRow
+
+#define ROUNDER (1 << (WEBP_RESCALER_RFIX - 1))
+#define MULT_FIX(x, y) (((int64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+
+static void RescalerImportRowC(WebPRescaler* const wrk,
+                               const uint8_t* const src, int channel) {
+  const int x_stride = wrk->num_channels;
+  const int x_out_max = wrk->dst_width * wrk->num_channels;
+  int x_in = channel;
+  int x_out;
+  int accum = 0;
+  if (!wrk->x_expand) {
+    int sum = 0;
+    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
+      accum += wrk->x_add;
+      for (; accum > 0; accum -= wrk->x_sub) {
+        sum += src[x_in];
+        x_in += x_stride;
+      }
+      {        // Emit next horizontal pixel.
+        const int32_t base = src[x_in];
+        const int32_t frac = base * (-accum);
+        x_in += x_stride;
+        wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac;
+        // fresh fractional start for next pixel
+        sum = (int)MULT_FIX(frac, wrk->fx_scale);
+      }
+    }
+  } else {        // simple bilinear interpolation
+    int left = src[channel], right = src[channel];
+    for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
+      if (accum < 0) {
+        left = right;
+        x_in += x_stride;
+        right = src[x_in];
+        accum += wrk->x_add;
+      }
+      wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
+      accum -= wrk->x_sub;
+    }
+  }
+  // Accumulate the contribution of the new row.
+  for (x_out = channel; x_out < x_out_max; x_out += x_stride) {
+    wrk->irow[x_out] += wrk->frow[x_out];
+  }
+}
+
+void WebPRescalerExportRowC(WebPRescaler* const wrk, int x_out) {
+  if (wrk->y_accum <= 0) {
+    uint8_t* const dst = wrk->dst;
+    int32_t* const irow = wrk->irow;
+    const int32_t* const frow = wrk->frow;
+    const int yscale = wrk->fy_scale * (-wrk->y_accum);
+    const int x_out_max = wrk->dst_width * wrk->num_channels;
+    for (; x_out < x_out_max; ++x_out) {
+      const int frac = (int)MULT_FIX(frow[x_out], yscale);
+      const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+      dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
+      irow[x_out] = frac;   // new fractional start
+    }
+    wrk->y_accum += wrk->y_add;
+    wrk->dst += wrk->dst_stride;
+  }
+}
+
+#undef MULT_FIX
+#undef ROUNDER
+
+//------------------------------------------------------------------------------
+
+void (*WebPRescalerImportRow)(struct WebPRescaler* const wrk,
+                              const uint8_t* const src, int channel);
+void (*WebPRescalerExportRow)(struct WebPRescaler* const wrk, int x_out);
+
+extern void WebPRescalerDspInitMIPS32(void);
+extern void WebPRescalerDspInitMIPSdspR2(void);
+
+static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
+    (VP8CPUInfo)&rescaler_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
+  if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  WebPRescalerImportRow = RescalerImportRowC;
+  WebPRescalerExportRow = WebPRescalerExportRowC;
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      WebPRescalerDspInitMIPS32();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      WebPRescalerDspInitMIPSdspR2();
+    }
+#endif
+  }
+  rescaler_last_cpuinfo_used = VP8GetCPUInfo;
+}
diff --git a/src/loaders/webp/dsp/upsampling.cpp b/src/loaders/webp/dsp/upsampling.cpp
new file mode 100644
index 00000000..151a32ee
--- /dev/null
+++ b/src/loaders/webp/dsp/upsampling.cpp
@@ -0,0 +1,252 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV to RGB upsampling functions.
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include "./dsp.h"
+#include "./yuv.h"
+
+#include <assert.h>
+
+//------------------------------------------------------------------------------
+// Fancy upsampler
+
+#ifdef FANCY_UPSAMPLING
+
+// Fancy upsampling functions to convert YUV to RGB
+WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
+
+// Given samples laid out in a square as:
+//  [a b]
+//  [c d]
+// we interpolate u/v as:
+//  ([9*a + 3*b + 3*c +   d    3*a + 9*b + 3*c +   d] + [8 8]) / 16
+//  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
+
+// We process u and v together stashed into 32bit (16bit each).
+#define LOAD_UV(u, v) ((u) | ((v) << 16))
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                                  \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int x;                                                                       \
+  const int last_pixel_pair = (len - 1) >> 1;                                  \
+  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
+  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
+  assert(top_y != NULL);                                                       \
+  {                                                                            \
+    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
+    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
+  }                                                                            \
+  if (bottom_y != NULL) {                                                      \
+    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
+    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
+  }                                                                            \
+  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
+    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
+    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
+    /* precompute invariant values associated with first and second diagonals*/\
+    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
+    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
+    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
+    {                                                                          \
+      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
+      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
+      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
+           top_dst + (2 * x - 1) * XSTEP);                                     \
+      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
+           top_dst + (2 * x - 0) * XSTEP);                                     \
+    }                                                                          \
+    if (bottom_y != NULL) {                                                    \
+      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
+      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
+      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
+           bottom_dst + (2 * x - 1) * XSTEP);                                  \
+      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
+           bottom_dst + (2 * x + 0) * XSTEP);                                  \
+    }                                                                          \
+    tl_uv = t_uv;                                                              \
+    l_uv = uv;                                                                 \
+  }                                                                            \
+  if (!(len & 1)) {                                                            \
+    {                                                                          \
+      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
+      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
+           top_dst + (len - 1) * XSTEP);                                       \
+    }                                                                          \
+    if (bottom_y != NULL) {                                                    \
+      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
+      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
+           bottom_dst + (len - 1) * XSTEP);                                    \
+    }                                                                          \
+  }                                                                            \
+}
+
+// All variants implemented.
+UPSAMPLE_FUNC(UpsampleRgbLinePair,  VP8YuvToRgb,  3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair,  VP8YuvToBgr,  3)
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair,  VP8YuvToRgb565,  2)
+
+#undef LOAD_UV
+#undef UPSAMPLE_FUNC
+
+#endif  // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+
+#if !defined(FANCY_UPSAMPLING)
+#define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC)                                      \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* bot_u, const uint8_t* bot_v,              \
+                      uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
+  const int half_len = len >> 1;                                               \
+  int x;                                                                       \
+  assert(top_dst != NULL);                                                     \
+  {                                                                            \
+    for (x = 0; x < half_len; ++x) {                                           \
+      FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0);         \
+      FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4);         \
+    }                                                                          \
+    if (len & 1) FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x);  \
+  }                                                                            \
+  if (bot_dst != NULL) {                                                       \
+    for (x = 0; x < half_len; ++x) {                                           \
+      FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x + 0);         \
+      FUNC(bot_y[2 * x + 1], bot_u[x], bot_v[x], bot_dst + 8 * x + 4);         \
+    }                                                                          \
+    if (len & 1) FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x);  \
+  }                                                                            \
+}
+
+DUAL_SAMPLE_FUNC(DualLineSamplerBGRA, VP8YuvToBgra)
+DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
+#undef DUAL_SAMPLE_FUNC
+
+#endif  // !FANCY_UPSAMPLING
+
+WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
+  WebPInitUpsamplers();
+  VP8YUVInit();
+#ifdef FANCY_UPSAMPLING
+  return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
+#else
+  return (alpha_is_last ? DualLineSamplerBGRA : DualLineSamplerARGB);
+#endif
+}
+
+//------------------------------------------------------------------------------
+// YUV444 converter
+
+#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP)                                    \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+                      uint8_t* dst, int len) {                                 \
+  int i;                                                                       \
+  for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]);           \
+}
+
+YUV444_FUNC(Yuv444ToRgb,      VP8YuvToRgb,  3)
+YUV444_FUNC(Yuv444ToBgr,      VP8YuvToBgr,  3)
+YUV444_FUNC(Yuv444ToRgba,     VP8YuvToRgba, 4)
+YUV444_FUNC(Yuv444ToBgra,     VP8YuvToBgra, 4)
+YUV444_FUNC(Yuv444ToArgb,     VP8YuvToArgb, 4)
+YUV444_FUNC(Yuv444ToRgba4444, VP8YuvToRgba4444, 2)
+YUV444_FUNC(Yuv444ToRgb565,   VP8YuvToRgb565, 2)
+
+#undef YUV444_FUNC
+
+WebPYUV444Converter WebPYUV444Converters[MODE_LAST];
+
+extern void WebPInitYUV444ConvertersMIPSdspR2(void);
+
+static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
+    (VP8CPUInfo)&upsampling_last_cpuinfo_used1;
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
+  if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
+
+  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb;
+  WebPYUV444Converters[MODE_RGBA]      = Yuv444ToRgba;
+  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr;
+  WebPYUV444Converters[MODE_BGRA]      = Yuv444ToBgra;
+  WebPYUV444Converters[MODE_ARGB]      = Yuv444ToArgb;
+  WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
+  WebPYUV444Converters[MODE_RGB_565]   = Yuv444ToRgb565;
+  WebPYUV444Converters[MODE_rgbA]      = Yuv444ToRgba;
+  WebPYUV444Converters[MODE_bgrA]      = Yuv444ToBgra;
+  WebPYUV444Converters[MODE_Argb]      = Yuv444ToArgb;
+  WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
+
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      WebPInitYUV444ConvertersMIPSdspR2();
+    }
+#endif
+  }
+  upsampling_last_cpuinfo_used1 = VP8GetCPUInfo;
+}
+
+//------------------------------------------------------------------------------
+// Main calls
+
+extern void WebPInitUpsamplersSSE2(void);
+extern void WebPInitUpsamplersNEON(void);
+extern void WebPInitUpsamplersMIPSdspR2(void);
+
+static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
+    (VP8CPUInfo)&upsampling_last_cpuinfo_used2;
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
+  if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
+
+#ifdef FANCY_UPSAMPLING
+  WebPUpsamplers[MODE_RGB]       = UpsampleRgbLinePair;
+  WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_BGR]       = UpsampleBgrLinePair;
+  WebPUpsamplers[MODE_BGRA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_ARGB]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+  WebPUpsamplers[MODE_RGB_565]   = UpsampleRgb565LinePair;
+  WebPUpsamplers[MODE_rgbA]      = UpsampleRgbaLinePair;
+  WebPUpsamplers[MODE_bgrA]      = UpsampleBgraLinePair;
+  WebPUpsamplers[MODE_Argb]      = UpsampleArgbLinePair;
+  WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitUpsamplersSSE2();
+    }
+#endif
+#if defined(WEBP_USE_NEON)
+    if (VP8GetCPUInfo(kNEON)) {
+      WebPInitUpsamplersNEON();
+    }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      WebPInitUpsamplersMIPSdspR2();
+    }
+#endif
+  }
+#endif  // FANCY_UPSAMPLING
+  upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/loaders/webp/dsp/yuv.cpp b/src/loaders/webp/dsp/yuv.cpp
new file mode 100644
index 00000000..086a9257
--- /dev/null
+++ b/src/loaders/webp/dsp/yuv.cpp
@@ -0,0 +1,166 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV->RGB conversion functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./yuv.h"
+
+#if defined(WEBP_YUV_USE_TABLE)
+
+static int done = 0;
+
+static WEBP_INLINE uint8_t clip(int v, int max_value) {
+  return v < 0 ? 0 : v > max_value ? max_value : v;
+}
+
+int16_t VP8kVToR[256], VP8kUToB[256];
+int32_t VP8kVToG[256], VP8kUToG[256];
+uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {
+  int i;
+  if (done) {
+    return;
+  }
+#ifndef USE_YUVj
+  for (i = 0; i < 256; ++i) {
+    VP8kVToR[i] = (89858 * (i - 128) + YUV_HALF) >> YUV_FIX;
+    VP8kUToG[i] = -22014 * (i - 128) + YUV_HALF;
+    VP8kVToG[i] = -45773 * (i - 128);
+    VP8kUToB[i] = (113618 * (i - 128) + YUV_HALF) >> YUV_FIX;
+  }
+  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
+    const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX;
+    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
+    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
+  }
+#else
+  for (i = 0; i < 256; ++i) {
+    VP8kVToR[i] = (91881 * (i - 128) + YUV_HALF) >> YUV_FIX;
+    VP8kUToG[i] = -22554 * (i - 128) + YUV_HALF;
+    VP8kVToG[i] = -46802 * (i - 128);
+    VP8kUToB[i] = (116130 * (i - 128) + YUV_HALF) >> YUV_FIX;
+  }
+  for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) {
+    const int k = i;
+    VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255);
+    VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15);
+  }
+#endif
+
+  done = 1;
+}
+
+#else
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
+
+#endif  // WEBP_YUV_USE_TABLE
+
+//-----------------------------------------------------------------------------
+// Plain-C version
+
+#define ROW_FUNC(FUNC_NAME, FUNC, XSTEP)                                       \
+static void FUNC_NAME(const uint8_t* y,                                        \
+                      const uint8_t* u, const uint8_t* v,                      \
+                      uint8_t* dst, int len) {                                 \
+  const uint8_t* const end = dst + (len & ~1) * XSTEP;                         \
+  while (dst != end) {                                                         \
+    FUNC(y[0], u[0], v[0], dst);                                               \
+    FUNC(y[1], u[0], v[0], dst + XSTEP);                                       \
+    y += 2;                                                                    \
+    ++u;                                                                       \
+    ++v;                                                                       \
+    dst += 2 * XSTEP;                                                          \
+  }                                                                            \
+  if (len & 1) {                                                               \
+    FUNC(y[0], u[0], v[0], dst);                                               \
+  }                                                                            \
+}                                                                              \
+
+// All variants implemented.
+ROW_FUNC(YuvToRgbRow,      VP8YuvToRgb,  3)
+ROW_FUNC(YuvToBgrRow,      VP8YuvToBgr,  3)
+ROW_FUNC(YuvToRgbaRow,     VP8YuvToRgba, 4)
+ROW_FUNC(YuvToBgraRow,     VP8YuvToBgra, 4)
+ROW_FUNC(YuvToArgbRow,     VP8YuvToArgb, 4)
+ROW_FUNC(YuvToRgba4444Row, VP8YuvToRgba4444, 2)
+ROW_FUNC(YuvToRgb565Row,   VP8YuvToRgb565, 2)
+
+#undef ROW_FUNC
+
+// Main call for processing a plane with a WebPSamplerRowFunc function:
+void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
+                             const uint8_t* u, const uint8_t* v, int uv_stride,
+                             uint8_t* dst, int dst_stride,
+                             int width, int height, WebPSamplerRowFunc func) {
+  int j;
+  for (j = 0; j < height; ++j) {
+    func(y, u, v, dst, width);
+    y += y_stride;
+    if (j & 1) {
+      u += uv_stride;
+      v += uv_stride;
+    }
+    dst += dst_stride;
+  }
+}
+
+//-----------------------------------------------------------------------------
+// Main call
+
+WebPSamplerRowFunc WebPSamplers[MODE_LAST];
+
+extern void WebPInitSamplersSSE2(void);
+extern void WebPInitSamplersMIPS32(void);
+extern void WebPInitSamplersMIPSdspR2(void);
+
+static volatile VP8CPUInfo yuv_last_cpuinfo_used =
+    (VP8CPUInfo)&yuv_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
+  if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+  WebPSamplers[MODE_RGB]       = YuvToRgbRow;
+  WebPSamplers[MODE_RGBA]      = YuvToRgbaRow;
+  WebPSamplers[MODE_BGR]       = YuvToBgrRow;
+  WebPSamplers[MODE_BGRA]      = YuvToBgraRow;
+  WebPSamplers[MODE_ARGB]      = YuvToArgbRow;
+  WebPSamplers[MODE_RGBA_4444] = YuvToRgba4444Row;
+  WebPSamplers[MODE_RGB_565]   = YuvToRgb565Row;
+  WebPSamplers[MODE_rgbA]      = YuvToRgbaRow;
+  WebPSamplers[MODE_bgrA]      = YuvToBgraRow;
+  WebPSamplers[MODE_Argb]      = YuvToArgbRow;
+  WebPSamplers[MODE_rgbA_4444] = YuvToRgba4444Row;
+
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      WebPInitSamplersSSE2();
+    }
+#endif  // WEBP_USE_SSE2
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      WebPInitSamplersMIPS32();
+    }
+#endif  // WEBP_USE_MIPS32
+#if defined(WEBP_USE_MIPS_DSP_R2)
+    if (VP8GetCPUInfo(kMIPSdspR2)) {
+      WebPInitSamplersMIPSdspR2();
+    }
+#endif  // WEBP_USE_MIPS_DSP_R2
+  }
+  yuv_last_cpuinfo_used = VP8GetCPUInfo;
+}
+
+//-----------------------------------------------------------------------------
diff --git a/src/loaders/webp/dsp/yuv.h b/src/loaders/webp/dsp/yuv.h
new file mode 100644
index 00000000..8a47edd8
--- /dev/null
+++ b/src/loaders/webp/dsp/yuv.h
@@ -0,0 +1,321 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// inline YUV<->RGB conversion function
+//
+// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
+// More information at: http://en.wikipedia.org/wiki/YCbCr
+// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
+// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
+// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
+// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
+//
+// For the Y'CbCr to RGB conversion, the BT.601 specification reads:
+//   R = 1.164 * (Y-16) + 1.596 * (V-128)
+//   G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
+//   B = 1.164 * (Y-16)                   + 2.018 * (U-128)
+// where Y is in the [16,235] range, and U/V in the [16,240] range.
+// In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
+// "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
+// So in this case the formulae should read:
+//   R = 1.164 * [Y + 1.371 * (V-128)                  ] - 18.624
+//   G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
+//   B = 1.164 * [Y                   + 1.733 * (U-128)] - 18.624
+// once factorized.
+// For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2).
+// That's the maximum possible for a convenient ARM implementation.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_DSP_YUV_H_
+#define WEBP_DSP_YUV_H_
+
+#include "./dsp.h"
+#include "../dec/decode_vp8.h"
+
+// Define the following to use the LUT-based code:
+// #define WEBP_YUV_USE_TABLE
+
+#if defined(WEBP_EXPERIMENTAL_FEATURES)
+// Do NOT activate this feature for real compression. This is only experimental!
+// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
+// This colorspace is close to Rec.601's Y'CbCr model with the notable
+// difference of allowing larger range for luma/chroma.
+// See http://en.wikipedia.org/wiki/YCbCr#JPEG_conversion paragraph, and its
+// difference with http://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
+// #define USE_YUVj
+#endif
+
+//------------------------------------------------------------------------------
+// YUV -> RGB conversion
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+  YUV_FIX = 16,                    // fixed-point precision for RGB->YUV
+  YUV_HALF = 1 << (YUV_FIX - 1),
+  YUV_MASK = (256 << YUV_FIX) - 1,
+  YUV_RANGE_MIN = -227,            // min value of r/g/b output
+  YUV_RANGE_MAX = 256 + 226,       // max value of r/g/b output
+
+  YUV_FIX2 = 14,                   // fixed-point precision for YUV->RGB
+  YUV_HALF2 = 1 << (YUV_FIX2 - 1),
+  YUV_MASK2 = (256 << YUV_FIX2) - 1
+};
+
+// These constants are 14b fixed-point version of ITU-R BT.601 constants.
+#define kYScale 19077    // 1.164 = 255 / 219
+#define kVToR   26149    // 1.596 = 255 / 112 * 0.701
+#define kUToG   6419     // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
+#define kVToG   13320    // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
+#define kUToB   33050    // 2.018 = 255 / 112 * 0.886
+#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
+#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
+#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)
+
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_YUV_USE_TABLE)
+
+// slower on x86 by ~7-8%, but bit-exact with the SSE2 version
+
+static WEBP_INLINE int VP8Clip8(int v) {
+  return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : (v < 0) ? 0 : 255;
+}
+
+static WEBP_INLINE int VP8YUVToR(int y, int v) {
+  return VP8Clip8(kYScale * y + kVToR * v + kRCst);
+}
+
+static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
+  return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);
+}
+
+static WEBP_INLINE int VP8YUVToB(int y, int u) {
+  return VP8Clip8(kYScale * y + kUToB * u + kBCst);
+}
+
+static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
+                                    uint8_t* const rgb) {
+  rgb[0] = VP8YUVToR(y, v);
+  rgb[1] = VP8YUVToG(y, u, v);
+  rgb[2] = VP8YUVToB(y, u);
+}
+
+static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
+                                    uint8_t* const bgr) {
+  bgr[0] = VP8YUVToB(y, u);
+  bgr[1] = VP8YUVToG(y, u, v);
+  bgr[2] = VP8YUVToR(y, v);
+}
+
+static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
+                                       uint8_t* const rgb) {
+  const int r = VP8YUVToR(y, v);      // 5 usable bits
+  const int g = VP8YUVToG(y, u, v);   // 6 usable bits
+  const int b = VP8YUVToB(y, u);      // 5 usable bits
+  const int rg = (r & 0xf8) | (g >> 5);
+  const int gb = ((g << 3) & 0xe0) | (b >> 3);
+#ifdef WEBP_SWAP_16BIT_CSP
+  rgb[0] = gb;
+  rgb[1] = rg;
+#else
+  rgb[0] = rg;
+  rgb[1] = gb;
+#endif
+}
+
+static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
+                                         uint8_t* const argb) {
+  const int r = VP8YUVToR(y, v);        // 4 usable bits
+  const int g = VP8YUVToG(y, u, v);     // 4 usable bits
+  const int b = VP8YUVToB(y, u);        // 4 usable bits
+  const int rg = (r & 0xf0) | (g >> 4);
+  const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
+#ifdef WEBP_SWAP_16BIT_CSP
+  argb[0] = ba;
+  argb[1] = rg;
+#else
+  argb[0] = rg;
+  argb[1] = ba;
+#endif
+}
+
+#else
+
+// Table-based version, not totally equivalent to the SSE2 version.
+// Rounding diff is only +/-1 though.
+
+extern int16_t VP8kVToR[256], VP8kUToB[256];
+extern int32_t VP8kVToG[256], VP8kUToG[256];
+extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
+static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
+                                    uint8_t* const rgb) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+  rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+}
+
+static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
+                                    uint8_t* const bgr) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+}
+
+static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
+                                       uint8_t* const rgb) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  const int rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
+                  (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
+  const int gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
+                   (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
+#ifdef WEBP_SWAP_16BIT_CSP
+  rgb[0] = gb;
+  rgb[1] = rg;
+#else
+  rgb[0] = rg;
+  rgb[1] = gb;
+#endif
+}
+
+static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
+                                         uint8_t* const argb) {
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  const int rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+                   VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
+  const int ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
+#ifdef WEBP_SWAP_16BIT_CSP
+  argb[0] = ba;
+  argb[1] = rg;
+#else
+  argb[0] = rg;
+  argb[1] = ba;
+#endif
+}
+
+#endif  // WEBP_YUV_USE_TABLE
+
+//-----------------------------------------------------------------------------
+// Alpha handling variants
+
+static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const argb) {
+  argb[0] = 0xff;
+  VP8YuvToRgb(y, u, v, argb + 1);
+}
+
+static WEBP_INLINE void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const bgra) {
+  VP8YuvToBgr(y, u, v, bgra);
+  bgra[3] = 0xff;
+}
+
+static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
+                                     uint8_t* const rgba) {
+  VP8YuvToRgb(y, u, v, rgba);
+  rgba[3] = 0xff;
+}
+
+// Must be called before everything, to initialize the tables.
+void VP8YUVInit(void);
+
+//-----------------------------------------------------------------------------
+// SSE2 extra functions (mostly for upsampling_sse2.c)
+
+#if defined(WEBP_USE_SSE2)
+
+// When the following is defined, tables are initialized statically, adding ~12k
+// to the binary size. Otherwise, they are initialized at run-time (small cost).
+#define WEBP_YUV_USE_SSE2_TABLES
+
+#if defined(FANCY_UPSAMPLING)
+// Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
+void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst);
+void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst);
+void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst);
+void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst);
+#endif  // FANCY_UPSAMPLING
+
+// Must be called to initialize tables before using the functions.
+void VP8YUVInitSSE2(void);
+
+#endif    // WEBP_USE_SSE2
+
+//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+
+// Stub functions that can be called with various rounding values:
+static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
+  uv = (uv + rounding + (128 << (YUV_FIX + 2))) >> (YUV_FIX + 2);
+  return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
+}
+
+#ifndef USE_YUVj
+
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
+  const int luma = 16839 * r + 33059 * g + 6420 * b;
+  return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX;  // no need to clip
+}
+
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
+  const int u = -9719 * r - 19081 * g + 28800 * b;
+  return VP8ClipUV(u, rounding);
+}
+
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
+  const int v = +28800 * r - 24116 * g - 4684 * b;
+  return VP8ClipUV(v, rounding);
+}
+
+#else
+
+// This JPEG-YUV colorspace, only for comparison!
+// These are also 16bit precision coefficients from Rec.601, but with full
+// [0..255] output range.
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
+  const int luma = 19595 * r + 38470 * g + 7471 * b;
+  return (luma + rounding) >> YUV_FIX;  // no need to clip
+}
+
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
+  const int u = -11058 * r - 21710 * g + 32768 * b;
+  return VP8ClipUV(u, rounding);
+}
+
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
+  const int v = 32768 * r - 27439 * g - 5329 * b;
+  return VP8ClipUV(v, rounding);
+}
+
+#endif    // USE_YUVj
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DSP_YUV_H_ */
diff --git a/src/loaders/webp/meson.build b/src/loaders/webp/meson.build
new file mode 100644
index 00000000..5b069da6
--- /dev/null
+++ b/src/loaders/webp/meson.build
@@ -0,0 +1,17 @@
+webp_deb = []
+
+subdir('dec')
+subdir('dsp')
+subdir('utils')
+subdir('webp')
+
+source_file = [
+   'tvgWebpLoader.h',
+   'tvgWebpLoader.cpp',
+]
+
+subloader_dep += [declare_dependency(
+    include_directories : include_directories('.'),
+    dependencies : webp_deb,
+    sources : source_file
+    )]
diff --git a/src/loaders/webp/tvgWebpLoader.cpp b/src/loaders/webp/tvgWebpLoader.cpp
new file mode 100644
index 00000000..c9a7eb62
--- /dev/null
+++ b/src/loaders/webp/tvgWebpLoader.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2024 the ThorVG project. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "webp/decode.h"
+#include "tvgWebpLoader.h"
+
+
+/************************************************************************/
+/* Internal Class Implementation                                        */
+/************************************************************************/
+
+void WebpLoader::clear()
+{
+    if (freeData) free(data);
+    data = nullptr;
+    freeData = false;
+}
+
+
+void WebpLoader::run(unsigned tid)
+{
+    //TODO: we can figure out the requested image format in advance.
+    surface.buf8 = WebPDecodeBGRA(data, size, nullptr, nullptr);
+    surface.stride = static_cast<uint32_t>(w);
+    surface.w = static_cast<uint32_t>(w);
+    surface.h = static_cast<uint32_t>(h);
+    surface.cs = ColorSpace::ARGB8888;
+    surface.channelSize = sizeof(uint32_t);
+    surface.premultiplied = false;
+
+    clear();
+}
+
+
+/************************************************************************/
+/* External Class Implementation                                        */
+/************************************************************************/
+
+WebpLoader::WebpLoader() : ImageLoader(FileType::Webp)
+{
+}
+
+
+WebpLoader::~WebpLoader()
+{
+    clear();
+    free(surface.buf8);
+}
+
+
+bool WebpLoader::open(const string& path)
+{
+    auto f = fopen(path.c_str(), "rb");
+    if (!f) return false;
+
+    fseek(f, 0, SEEK_END);
+
+    size = ftell(f);
+    if (size == 0) {
+        fclose(f);
+        return false;
+    }
+
+    data = (uint8_t*)malloc(size);
+
+    fseek(f, 0, SEEK_SET);
+    auto ret = fread(data, sizeof(char), size, f);
+    if (ret < size) {
+        fclose(f);
+        return false;
+    }
+
+    fclose(f);
+
+    int width, height;
+    if (!WebPGetInfo(data, size, &width, &height)) return false;
+
+    w = static_cast<float>(width);
+    h = static_cast<float>(height);
+    freeData = true;
+
+    return true;
+}
+
+
+bool WebpLoader::open(const char* data, uint32_t size, bool copy)
+{
+    if (copy) {
+        this->data = (uint8_t*) malloc(size);
+        if (!this->data) return false;
+        memcpy((uint8_t*)this->data, data, size);
+        freeData = true;
+    } else {
+        this->data = (uint8_t*) data;
+        freeData = false;
+    }
+
+    int width, height;
+    if (!WebPGetInfo(this->data, size, &width, &height)) return false;
+
+    w = static_cast<float>(width);
+    h = static_cast<float>(height);
+    this->size = size;
+
+    return true;
+}
+
+
+bool WebpLoader::read()
+{
+    if (!LoadModule::read()) return true;
+
+    if (!data || w == 0 || h == 0) return false;
+
+    TaskScheduler::request(this);
+
+    return true;
+}
+
+
+bool WebpLoader::close()
+{
+    if (!LoadModule::close()) return false;
+    this->done();
+    return true;
+}
+
+
+Surface* WebpLoader::bitmap()
+{
+    this->done();
+
+    return ImageLoader::bitmap();
+}
\ No newline at end of file
diff --git a/src/loaders/webp/tvgWebpLoader.h b/src/loaders/webp/tvgWebpLoader.h
new file mode 100644
index 00000000..61655761
--- /dev/null
+++ b/src/loaders/webp/tvgWebpLoader.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024 the ThorVG project. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _TVG_WEBP_LOADER_H_
+#define _TVG_WEBP_LOADER_H_
+
+#include "tvgLoader.h"
+#include "tvgTaskScheduler.h"
+
+class WebpLoader : public ImageLoader, public Task
+{
+private:
+    uint8_t* data = nullptr;
+    uint32_t size = 0;
+    bool freeData = false;
+
+    void clear();
+    void run(unsigned tid) override;
+
+public:
+    WebpLoader();
+    ~WebpLoader();
+
+    bool open(const string& path) override;
+    bool open(const char* data, uint32_t size, bool copy) override;
+    bool read() override;
+    bool close() override;
+
+    Surface* bitmap() override;
+};
+
+#endif //_TVG_WEBP_LOADER_H_
\ No newline at end of file
diff --git a/src/loaders/webp/utils/bit_reader.cpp b/src/loaders/webp/utils/bit_reader.cpp
new file mode 100644
index 00000000..a268c646
--- /dev/null
+++ b/src/loaders/webp/utils/bit_reader.cpp
@@ -0,0 +1,216 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Boolean decoder non-inlined methods
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include "./bit_reader_inl.h"
+
+//------------------------------------------------------------------------------
+// VP8BitReader
+
+void VP8InitBitReader(VP8BitReader* const br,
+                      const uint8_t* const start, const uint8_t* const end) {
+  assert(br != NULL);
+  assert(start != NULL);
+  assert(start <= end);
+  br->range_   = 255 - 1;
+  br->buf_     = start;
+  br->buf_end_ = end;
+  br->value_   = 0;
+  br->bits_    = -8;   // to load the very first 8bits
+  br->eof_     = 0;
+  VP8LoadNewBytes(br);
+}
+
+void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset) {
+  if (br->buf_ != NULL) {
+    br->buf_ += offset;
+    br->buf_end_ += offset;
+  }
+}
+
+const uint8_t kVP8Log2Range[128] = {
+     7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0
+};
+
+// range = ((range - 1) << kVP8Log2Range[range]) + 1
+const uint8_t kVP8NewRange[128] = {
+  127, 127, 191, 127, 159, 191, 223, 127,
+  143, 159, 175, 191, 207, 223, 239, 127,
+  135, 143, 151, 159, 167, 175, 183, 191,
+  199, 207, 215, 223, 231, 239, 247, 127,
+  131, 135, 139, 143, 147, 151, 155, 159,
+  163, 167, 171, 175, 179, 183, 187, 191,
+  195, 199, 203, 207, 211, 215, 219, 223,
+  227, 231, 235, 239, 243, 247, 251, 127,
+  129, 131, 133, 135, 137, 139, 141, 143,
+  145, 147, 149, 151, 153, 155, 157, 159,
+  161, 163, 165, 167, 169, 171, 173, 175,
+  177, 179, 181, 183, 185, 187, 189, 191,
+  193, 195, 197, 199, 201, 203, 205, 207,
+  209, 211, 213, 215, 217, 219, 221, 223,
+  225, 227, 229, 231, 233, 235, 237, 239,
+  241, 243, 245, 247, 249, 251, 253, 127
+};
+
+void VP8LoadFinalBytes(VP8BitReader* const br) {
+  assert(br != NULL && br->buf_ != NULL);
+  // Only read 8bits at a time
+  if (br->buf_ < br->buf_end_) {
+    br->bits_ += 8;
+    br->value_ = (bit_t)(*br->buf_++) | (br->value_ << 8);
+  } else if (!br->eof_) {
+    br->value_ <<= 8;
+    br->bits_ += 8;
+    br->eof_ = 1;
+  } else {
+    br->bits_ = 0;  // This is to avoid undefined behaviour with shifts.
+  }
+}
+
+//------------------------------------------------------------------------------
+// Higher-level calls
+
+uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
+  uint32_t v = 0;
+  while (bits-- > 0) {
+    v |= VP8GetBit(br, 0x80) << bits;
+  }
+  return v;
+}
+
+int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
+  const int value = VP8GetValue(br, bits);
+  return VP8Get(br) ? -value : value;
+}
+
+//------------------------------------------------------------------------------
+// VP8LBitReader
+
+#define VP8L_LOG8_WBITS 4  // Number of bytes needed to store VP8L_WBITS bits.
+
+#if !defined(WEBP_FORCE_ALIGNED) && \
+    (defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
+     defined(__i386__) || defined(_M_IX86) || \
+     defined(__x86_64__) || defined(_M_X64))
+#define VP8L_USE_UNALIGNED_LOAD
+#endif
+
+static const uint32_t kBitMask[VP8L_MAX_NUM_BIT_READ + 1] = {
+  0,
+  0x000001, 0x000003, 0x000007, 0x00000f,
+  0x00001f, 0x00003f, 0x00007f, 0x0000ff,
+  0x0001ff, 0x0003ff, 0x0007ff, 0x000fff,
+  0x001fff, 0x003fff, 0x007fff, 0x00ffff,
+  0x01ffff, 0x03ffff, 0x07ffff, 0x0fffff,
+  0x1fffff, 0x3fffff, 0x7fffff, 0xffffff
+};
+
+void VP8LInitBitReader(VP8LBitReader* const br, const uint8_t* const start,
+                       size_t length) {
+  size_t i;
+  vp8l_val_t value = 0;
+  assert(br != NULL);
+  assert(start != NULL);
+  assert(length < 0xfffffff8u);   // can't happen with a RIFF chunk.
+
+  br->len_ = length;
+  br->val_ = 0;
+  br->bit_pos_ = 0;
+  br->eos_ = 0;
+
+  if (length > sizeof(br->val_)) {
+    length = sizeof(br->val_);
+  }
+  for (i = 0; i < length; ++i) {
+    value |= (vp8l_val_t)start[i] << (8 * i);
+  }
+  br->val_ = value;
+  br->pos_ = length;
+  br->buf_ = start;
+}
+
+void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
+                            const uint8_t* const buf, size_t len) {
+  assert(br != NULL);
+  assert(buf != NULL);
+  assert(len < 0xfffffff8u);   // can't happen with a RIFF chunk.
+  br->buf_ = buf;
+  br->len_ = len;
+  // pos_ > len_ should be considered a param error.
+  br->eos_ = (br->pos_ > br->len_) || VP8LIsEndOfStream(br);
+}
+
+static void VP8LSetEndOfStream(VP8LBitReader* const br) {
+  br->eos_ = 1;
+  br->bit_pos_ = 0;  // To avoid undefined behaviour with shifts.
+}
+
+// If not at EOS, reload up to VP8L_LBITS byte-by-byte
+static void ShiftBytes(VP8LBitReader* const br) {
+  while (br->bit_pos_ >= 8 && br->pos_ < br->len_) {
+    br->val_ >>= 8;
+    br->val_ |= ((vp8l_val_t)br->buf_[br->pos_]) << (VP8L_LBITS - 8);
+    ++br->pos_;
+    br->bit_pos_ -= 8;
+  }
+  if (VP8LIsEndOfStream(br)) {
+    VP8LSetEndOfStream(br);
+  }
+}
+
+void VP8LDoFillBitWindow(VP8LBitReader* const br) {
+  assert(br->bit_pos_ >= VP8L_WBITS);
+  // TODO(jzern): given the fixed read size it may be possible to force
+  //              alignment in this block.
+#if defined(VP8L_USE_UNALIGNED_LOAD)
+  if (br->pos_ + sizeof(br->val_) < br->len_) {
+    br->val_ >>= VP8L_WBITS;
+    br->bit_pos_ -= VP8L_WBITS;
+    // The expression below needs a little-endian arch to work correctly.
+    // This gives a large speedup for decoding speed.
+    br->val_ |= (vp8l_val_t)*(const uint32_t*)(br->buf_ + br->pos_) <<
+                (VP8L_LBITS - VP8L_WBITS);
+    br->pos_ += VP8L_LOG8_WBITS;
+    return;
+  }
+#endif
+  ShiftBytes(br);       // Slow path.
+}
+
+uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
+  assert(n_bits >= 0);
+  // Flag an error if end_of_stream or n_bits is more than allowed limit.
+  if (!br->eos_ && n_bits <= VP8L_MAX_NUM_BIT_READ) {
+    const uint32_t val = VP8LPrefetchBits(br) & kBitMask[n_bits];
+    const int new_bits = br->bit_pos_ + n_bits;
+    br->bit_pos_ = new_bits;
+    ShiftBytes(br);
+    return val;
+  } else {
+    VP8LSetEndOfStream(br);
+    return 0;
+  }
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/loaders/webp/utils/bit_reader.h b/src/loaders/webp/utils/bit_reader.h
new file mode 100644
index 00000000..4ebbe539
--- /dev/null
+++ b/src/loaders/webp/utils/bit_reader.h
@@ -0,0 +1,168 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Boolean decoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+//         Vikas Arora (vikaas.arora@gmail.com)
+
+#ifndef WEBP_UTILS_BIT_READER_H_
+#define WEBP_UTILS_BIT_READER_H_
+
+#include <assert.h>
+#ifdef _MSC_VER
+#include <stdlib.h>  // _byteswap_ulong
+#endif
+#include "../webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The Boolean decoder needs to maintain infinite precision on the value_ field.
+// However, since range_ is only 8bit, we only need an active window of 8 bits
+// for value_. Left bits (MSB) gets zeroed and shifted away when value_ falls
+// below 128, range_ is updated, and fresh bits read from the bitstream are
+// brought in as LSB. To avoid reading the fresh bits one by one (slow), we
+// cache BITS of them ahead. The total of (BITS + 8) bits must fit into a
+// natural register (with type bit_t). To fetch BITS bits from bitstream we
+// use a type lbit_t.
+//
+// BITS can be any multiple of 8 from 8 to 56 (inclusive).
+// Pick values that fit natural register size.
+
+#if defined(__i386__) || defined(_M_IX86)      // x86 32bit
+#define BITS 24
+#elif defined(__x86_64__) || defined(_M_X64)   // x86 64bit
+#define BITS 56
+#elif defined(__arm__) || defined(_M_ARM)      // ARM
+#define BITS 24
+#elif defined(__mips__)                        // MIPS
+#define BITS 24
+#else                                          // reasonable default
+#define BITS 24  // TODO(skal): test aarch64 and find the proper BITS value.
+#endif
+
+//------------------------------------------------------------------------------
+// Derived types and constants:
+//   bit_t = natural register type for storing 'value_' (which is BITS+8 bits)
+//   range_t = register for 'range_' (which is 8bits only)
+
+#if (BITS > 24)
+typedef uint64_t bit_t;
+#else
+typedef uint32_t bit_t;
+#endif
+
+typedef uint32_t range_t;
+
+//------------------------------------------------------------------------------
+// Bitreader
+
+typedef struct VP8BitReader VP8BitReader;
+struct VP8BitReader {
+  // boolean decoder  (keep the field ordering as is!)
+  bit_t value_;               // current value
+  range_t range_;             // current range minus 1. In [127, 254] interval.
+  int bits_;                  // number of valid bits left
+  // read buffer
+  const uint8_t* buf_;        // next byte to be read
+  const uint8_t* buf_end_;    // end of read buffer
+  int eof_;                   // true if input is exhausted
+};
+
+// Initialize the bit reader and the boolean decoder.
+void VP8InitBitReader(VP8BitReader* const br,
+                      const uint8_t* const start, const uint8_t* const end);
+
+// Update internal pointers to displace the byte buffer by the
+// relative offset 'offset'.
+void VP8RemapBitReader(VP8BitReader* const br, ptrdiff_t offset);
+
+// return the next value made of 'num_bits' bits
+uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
+static WEBP_INLINE uint32_t VP8Get(VP8BitReader* const br) {
+  return VP8GetValue(br, 1);
+}
+
+// return the next value with sign-extension.
+int32_t VP8GetSignedValue(VP8BitReader* const br, int num_bits);
+
+// bit_reader_inl.h will implement the following methods:
+//   static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob)
+//   static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v)
+// and should be included by the .c files that actually need them.
+// This is to avoid recompiling the whole library whenever this file is touched,
+// and also allowing platform-specific ad-hoc hacks.
+
+// -----------------------------------------------------------------------------
+// Bitreader for lossless format
+
+// maximum number of bits (inclusive) the bit-reader can handle:
+#define VP8L_MAX_NUM_BIT_READ 24
+
+#define VP8L_LBITS 64  // Number of bits prefetched (= bit-size of vp8l_val_t).
+#define VP8L_WBITS 32  // Minimum number of bytes ready after VP8LFillBitWindow.
+
+typedef uint64_t vp8l_val_t;  // right now, this bit-reader can only use 64bit.
+
+typedef struct {
+  vp8l_val_t     val_;        // pre-fetched bits
+  const uint8_t* buf_;        // input byte buffer
+  size_t         len_;        // buffer length
+  size_t         pos_;        // byte position in buf_
+  int            bit_pos_;    // current bit-reading position in val_
+  int            eos_;        // true if a bit was read past the end of buffer
+} VP8LBitReader;
+
+void VP8LInitBitReader(VP8LBitReader* const br,
+                       const uint8_t* const start,
+                       size_t length);
+
+//  Sets a new data buffer.
+void VP8LBitReaderSetBuffer(VP8LBitReader* const br,
+                            const uint8_t* const buffer, size_t length);
+
+// Reads the specified number of bits from read buffer.
+// Flags an error in case end_of_stream or n_bits is more than the allowed limit
+// of VP8L_MAX_NUM_BIT_READ (inclusive).
+// Flags eos_ if this read attempt is going to cross the read buffer.
+uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits);
+
+// Return the prefetched bits, so they can be looked up.
+static WEBP_INLINE uint32_t VP8LPrefetchBits(VP8LBitReader* const br) {
+  return (uint32_t)(br->val_ >> (br->bit_pos_ & (VP8L_LBITS - 1)));
+}
+
+// Returns true if there was an attempt at reading bit past the end of
+// the buffer. Doesn't set br->eos_ flag.
+static WEBP_INLINE int VP8LIsEndOfStream(const VP8LBitReader* const br) {
+  assert(br->pos_ <= br->len_);
+  return br->eos_ || ((br->pos_ == br->len_) && (br->bit_pos_ > VP8L_LBITS));
+}
+
+// For jumping over a number of bits in the bit stream when accessed with
+// VP8LPrefetchBits and VP8LFillBitWindow.
+static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
+  br->bit_pos_ = val;
+  br->eos_ = VP8LIsEndOfStream(br);
+}
+
+// Advances the read buffer by 4 bytes to make room for reading next 32 bits.
+// Speed critical, but infrequent part of the code can be non-inlined.
+extern void VP8LDoFillBitWindow(VP8LBitReader* const br);
+static WEBP_INLINE void VP8LFillBitWindow(VP8LBitReader* const br) {
+  if (br->bit_pos_ >= VP8L_WBITS) VP8LDoFillBitWindow(br);
+}
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_BIT_READER_H_ */
diff --git a/src/loaders/webp/utils/bit_reader_inl.h b/src/loaders/webp/utils/bit_reader_inl.h
new file mode 100644
index 00000000..1f79c941
--- /dev/null
+++ b/src/loaders/webp/utils/bit_reader_inl.h
@@ -0,0 +1,172 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Specific inlined methods for boolean decoder [VP8GetBit() ...]
+// This file should be included by the .c sources that actually need to call
+// these methods.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_BIT_READER_INL_H_
+#define WEBP_UTILS_BIT_READER_INL_H_
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#ifdef WEBP_FORCE_ALIGNED
+#include <string.h>  // memcpy
+#endif
+
+#include "../dsp/dsp.h"
+#include "./bit_reader.h"
+#include "./endian_inl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Derived type lbit_t = natural type for memory I/O
+
+#if   (BITS > 32)
+typedef uint64_t lbit_t;
+#elif (BITS > 16)
+typedef uint32_t lbit_t;
+#elif (BITS >  8)
+typedef uint16_t lbit_t;
+#else
+typedef uint8_t lbit_t;
+#endif
+
+extern const uint8_t kVP8Log2Range[128];
+extern const uint8_t kVP8NewRange[128];
+
+// special case for the tail byte-reading
+void VP8LoadFinalBytes(VP8BitReader* const br);
+
+//------------------------------------------------------------------------------
+// Inlined critical functions
+
+// makes sure br->value_ has at least BITS bits worth of data
+static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
+  assert(br != NULL && br->buf_ != NULL);
+  // Read 'BITS' bits at a time if possible.
+  if (br->buf_ + sizeof(lbit_t) <= br->buf_end_) {
+    // convert memory type to register type (with some zero'ing!)
+    bit_t bits;
+#if defined(WEBP_FORCE_ALIGNED)
+    lbit_t in_bits;
+    memcpy(&in_bits, br->buf_, sizeof(in_bits));
+#elif defined(WEBP_USE_MIPS32)
+    // This is needed because of un-aligned read.
+    lbit_t in_bits;
+    lbit_t* p_buf_ = (lbit_t*)br->buf_;
+    __asm__ volatile(
+      ".set   push                             \n\t"
+      ".set   at                               \n\t"
+      ".set   macro                            \n\t"
+      "ulw    %[in_bits], 0(%[p_buf_])         \n\t"
+      ".set   pop                              \n\t"
+      : [in_bits]"=r"(in_bits)
+      : [p_buf_]"r"(p_buf_)
+      : "memory", "at"
+    );
+#else
+    const lbit_t in_bits = *(const lbit_t*)br->buf_;
+#endif
+    br->buf_ += BITS >> 3;
+#if !defined(WORDS_BIGENDIAN)
+#if (BITS > 32)
+    bits = BSwap64(in_bits);
+    bits >>= 64 - BITS;
+#elif (BITS >= 24)
+    bits = BSwap32(in_bits);
+    bits >>= (32 - BITS);
+#elif (BITS == 16)
+    bits = BSwap16(in_bits);
+#else   // BITS == 8
+    bits = (bit_t)in_bits;
+#endif  // BITS > 32
+#else    // WORDS_BIGENDIAN
+    bits = (bit_t)in_bits;
+    if (BITS != 8 * sizeof(bit_t)) bits >>= (8 * sizeof(bit_t) - BITS);
+#endif
+    br->value_ = bits | (br->value_ << BITS);
+    br->bits_ += BITS;
+  } else {
+    VP8LoadFinalBytes(br);    // no need to be inlined
+  }
+}
+
+// Read a bit with proba 'prob'. Speed-critical function!
+static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
+  // Don't move this declaration! It makes a big speed difference to store
+  // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
+  // alter br->range_ value.
+  range_t range = br->range_;
+  if (br->bits_ < 0) {
+    VP8LoadNewBytes(br);
+  }
+  {
+    const int pos = br->bits_;
+    const range_t split = (range * prob) >> 8;
+    const range_t value = (range_t)(br->value_ >> pos);
+#if defined(__arm__) || defined(_M_ARM)      // ARM-specific
+    const int bit = ((int)(split - value) >> 31) & 1;
+    if (value > split) {
+      range -= split + 1;
+      br->value_ -= (bit_t)(split + 1) << pos;
+    } else {
+      range = split;
+    }
+#else  // faster version on x86
+    int bit;  // Don't use 'const int bit = (value > split);", it's slower.
+    if (value > split) {
+      range -= split + 1;
+      br->value_ -= (bit_t)(split + 1) << pos;
+      bit = 1;
+    } else {
+      range = split;
+      bit = 0;
+    }
+#endif
+    if (range <= (range_t)0x7e) {
+      const int shift = kVP8Log2Range[range];
+      range = kVP8NewRange[range];
+      br->bits_ -= shift;
+    }
+    br->range_ = range;
+    return bit;
+  }
+}
+
+// simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
+static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
+  if (br->bits_ < 0) {
+    VP8LoadNewBytes(br);
+  }
+  {
+    const int pos = br->bits_;
+    const range_t split = br->range_ >> 1;
+    const range_t value = (range_t)(br->value_ >> pos);
+    const int32_t mask = (int32_t)(split - value) >> 31;  // -1 or 0
+    br->bits_ -= 1;
+    br->range_ += mask;
+    br->range_ |= 1;
+    br->value_ -= (bit_t)((split + 1) & mask) << pos;
+    return (v ^ mask) - mask;
+  }
+}
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif   // WEBP_UTILS_BIT_READER_INL_H_
diff --git a/src/loaders/webp/utils/color_cache.cpp b/src/loaders/webp/utils/color_cache.cpp
new file mode 100644
index 00000000..bedaa72a
--- /dev/null
+++ b/src/loaders/webp/utils/color_cache.cpp
@@ -0,0 +1,48 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Color Cache for WebP Lossless
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./color_cache.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+// VP8LColorCache.
+
+int VP8LColorCacheInit(VP8LColorCache* const cc, int hash_bits) {
+  const int hash_size = 1 << hash_bits;
+  assert(cc != NULL);
+  assert(hash_bits > 0);
+  cc->colors_ = (uint32_t*)calloc((uint64_t)hash_size, sizeof(*cc->colors_));
+  if (cc->colors_ == NULL) return 0;
+  cc->hash_shift_ = 32 - hash_bits;
+  cc->hash_bits_ = hash_bits;
+  return 1;
+}
+
+void VP8LColorCacheClear(VP8LColorCache* const cc) {
+  if (cc != NULL) {
+    free(cc->colors_);
+    cc->colors_ = NULL;
+  }
+}
+
+void VP8LColorCacheCopy(const VP8LColorCache* const src,
+                        VP8LColorCache* const dst) {
+  assert(src != NULL);
+  assert(dst != NULL);
+  assert(src->hash_bits_ == dst->hash_bits_);
+  memcpy(dst->colors_, src->colors_,
+         ((size_t)1u << dst->hash_bits_) * sizeof(*dst->colors_));
+}
diff --git a/src/loaders/webp/utils/color_cache.h b/src/loaders/webp/utils/color_cache.h
new file mode 100644
index 00000000..a1c4b3aa
--- /dev/null
+++ b/src/loaders/webp/utils/color_cache.h
@@ -0,0 +1,74 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Color Cache for WebP Lossless
+//
+// Authors: Jyrki Alakuijala (jyrki@google.com)
+//          Urvang Joshi (urvang@google.com)
+
+#ifndef WEBP_UTILS_COLOR_CACHE_H_
+#define WEBP_UTILS_COLOR_CACHE_H_
+
+#include "../webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Main color cache struct.
+typedef struct {
+  uint32_t *colors_;  // color entries
+  int hash_shift_;    // Hash shift: 32 - hash_bits_.
+  int hash_bits_;
+} VP8LColorCache;
+
+static const uint32_t kHashMul = 0x1e35a7bd;
+
+static WEBP_INLINE uint32_t VP8LColorCacheLookup(
+    const VP8LColorCache* const cc, uint32_t key) {
+  assert(key <= (~0U >> cc->hash_shift_));
+  return cc->colors_[key];
+}
+
+static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
+                                             uint32_t argb) {
+  const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
+  cc->colors_[key] = argb;
+}
+
+static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
+                                              uint32_t argb) {
+  return (kHashMul * argb) >> cc->hash_shift_;
+}
+
+static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
+                                              uint32_t argb) {
+  const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
+  return cc->colors_[key] == argb;
+}
+
+//------------------------------------------------------------------------------
+
+// Initializes the color cache with 'hash_bits' bits for the keys.
+// Returns false in case of memory error.
+int VP8LColorCacheInit(VP8LColorCache* const color_cache, int hash_bits);
+
+void VP8LColorCacheCopy(const VP8LColorCache* const src,
+                        VP8LColorCache* const dst);
+
+// Delete the memory associated to color cache.
+void VP8LColorCacheClear(VP8LColorCache* const color_cache);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // WEBP_UTILS_COLOR_CACHE_H_
diff --git a/src/loaders/webp/utils/endian_inl.h b/src/loaders/webp/utils/endian_inl.h
new file mode 100644
index 00000000..cd56c37f
--- /dev/null
+++ b/src/loaders/webp/utils/endian_inl.h
@@ -0,0 +1,100 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Endian related functions.
+
+#ifndef WEBP_UTILS_ENDIAN_INL_H_
+#define WEBP_UTILS_ENDIAN_INL_H_
+
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
+#include "../dsp/dsp.h"
+#include "../webp/types.h"
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#endif
+
+#if !defined(HAVE_CONFIG_H)
+// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
+#if LOCAL_GCC_PREREQ(4,3) || LOCAL_CLANG_PREREQ(3,3)
+#define HAVE_BUILTIN_BSWAP32
+#define HAVE_BUILTIN_BSWAP64
+#endif
+// clang-3.3 and gcc-4.8 have a builtin function for swap16
+#if LOCAL_GCC_PREREQ(4,8) || LOCAL_CLANG_PREREQ(3,3)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+#endif  // !HAVE_CONFIG_H
+
+static WEBP_INLINE uint16_t BSwap16(uint16_t x) {
+#if defined(HAVE_BUILTIN_BSWAP16)
+  return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+  return _byteswap_ushort(x);
+#else
+  // gcc will recognize a 'rorw $8, ...' here:
+  return (x >> 8) | ((x & 0xff) << 8);
+#endif  // HAVE_BUILTIN_BSWAP16
+}
+
+static WEBP_INLINE uint32_t BSwap32(uint32_t x) {
+#if defined(WEBP_USE_MIPS32_R2)
+  uint32_t ret;
+  __asm__ volatile (
+    "wsbh   %[ret], %[x]          \n\t"
+    "rotr   %[ret], %[ret],  16   \n\t"
+    : [ret]"=r"(ret)
+    : [x]"r"(x)
+  );
+  return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
+  return __builtin_bswap32(x);
+#elif defined(__i386__) || defined(__x86_64__)
+  uint32_t swapped_bytes;
+  __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+  return swapped_bytes;
+#elif defined(_MSC_VER)
+  return (uint32_t)_byteswap_ulong(x);
+#else
+  return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif  // HAVE_BUILTIN_BSWAP32
+}
+
+static WEBP_INLINE uint64_t BSwap64(uint64_t x) {
+#if defined(HAVE_BUILTIN_BSWAP64)
+  return __builtin_bswap64(x);
+#elif defined(__x86_64__)
+  uint64_t swapped_bytes;
+  __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+  return swapped_bytes;
+#elif defined(_MSC_VER)
+  return (uint64_t)_byteswap_uint64(x);
+#else  // generic code for swapping 64-bit values (suggested by bdb@)
+  x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+  x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+  x = ((x & 0xff00ff00ff00ff00ull) >>  8) | ((x & 0x00ff00ff00ff00ffull) <<  8);
+  return x;
+#endif  // HAVE_BUILTIN_BSWAP64
+}
+
+#endif  // WEBP_UTILS_ENDIAN_INL_H_
diff --git a/src/loaders/webp/utils/huffman.cpp b/src/loaders/webp/utils/huffman.cpp
new file mode 100644
index 00000000..d7af79dd
--- /dev/null
+++ b/src/loaders/webp/utils/huffman.cpp
@@ -0,0 +1,204 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for building and looking up Huffman trees.
+//
+// Author: Urvang Joshi (urvang@google.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "./huffman.h"
+#include "../utils/utils.h"
+#include "../webp/format_constants.h"
+
+// Huffman data read via DecodeImageStream is represented in two (red and green)
+// bytes.
+#define MAX_HTREE_GROUPS    0x10000
+
+HTreeGroup* VP8LHtreeGroupsNew(int num_htree_groups) {
+  HTreeGroup* const htree_groups = (HTreeGroup*)malloc(num_htree_groups * sizeof(*htree_groups));
+  if (htree_groups == NULL) {
+    return NULL;
+  }
+  assert(num_htree_groups <= MAX_HTREE_GROUPS);
+  return htree_groups;
+}
+
+void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups) {
+  if (htree_groups != NULL) {
+    free(htree_groups);
+  }
+}
+
+// Returns reverse(reverse(key, len) + 1, len), where reverse(key, len) is the
+// bit-wise reversal of the len least significant bits of key.
+static WEBP_INLINE uint32_t GetNextKey(uint32_t key, int len) {
+  uint32_t step = 1 << (len - 1);
+  while (key & step) {
+    step >>= 1;
+  }
+  return (key & (step - 1)) + step;
+}
+
+// Stores code in table[0], table[step], table[2*step], ..., table[end].
+// Assumes that end is an integer multiple of step.
+static WEBP_INLINE void ReplicateValue(HuffmanCode* table,
+                                       int step, int end,
+                                       HuffmanCode code) {
+  assert(end % step == 0);
+  do {
+    end -= step;
+    table[end] = code;
+  } while (end > 0);
+}
+
+// Returns the table width of the next 2nd level table. count is the histogram
+// of bit lengths for the remaining symbols, len is the code length of the next
+// processed symbol
+static WEBP_INLINE int NextTableBitSize(const int* const count,
+                                        int len, int root_bits) {
+  int left = 1 << (len - root_bits);
+  while (len < MAX_ALLOWED_CODE_LENGTH) {
+    left -= count[len];
+    if (left <= 0) break;
+    ++len;
+    left <<= 1;
+  }
+  return len - root_bits;
+}
+
+int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+                          const int code_lengths[], int code_lengths_size) {
+  HuffmanCode* table = root_table;  // next available space in table
+  int total_size = 1 << root_bits;  // total size root table + 2nd level table
+  int* sorted = NULL;               // symbols sorted by code length
+  int len;                          // current code length
+  int symbol;                       // symbol index in original or sorted table
+  // number of codes of each length:
+  int count[MAX_ALLOWED_CODE_LENGTH + 1] = { 0 };
+  // offsets in sorted table for each length:
+  int offset[MAX_ALLOWED_CODE_LENGTH + 1];
+
+  assert(code_lengths_size != 0);
+  assert(code_lengths != NULL);
+  assert(root_table != NULL);
+  assert(root_bits > 0);
+
+  // Build histogram of code lengths.
+  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
+    if (code_lengths[symbol] > MAX_ALLOWED_CODE_LENGTH) {
+      return 0;
+    }
+    ++count[code_lengths[symbol]];
+  }
+
+  // Error, all code lengths are zeros.
+  if (count[0] == code_lengths_size) {
+    return 0;
+  }
+
+  // Generate offsets into sorted symbol table by code length.
+  offset[1] = 0;
+  for (len = 1; len < MAX_ALLOWED_CODE_LENGTH; ++len) {
+    if (count[len] > (1 << len)) {
+      return 0;
+    }
+    offset[len + 1] = offset[len] + count[len];
+  }
+
+  sorted = (int*)malloc(code_lengths_size * sizeof(*sorted));
+  if (sorted == NULL) {
+    return 0;
+  }
+
+  // Sort symbols by length, by symbol order within each length.
+  for (symbol = 0; symbol < code_lengths_size; ++symbol) {
+    const int symbol_code_length = code_lengths[symbol];
+    if (code_lengths[symbol] > 0) {
+      sorted[offset[symbol_code_length]++] = symbol;
+    }
+  }
+
+  // Special case code with only one value.
+  if (offset[MAX_ALLOWED_CODE_LENGTH] == 1) {
+    HuffmanCode code;
+    code.bits = 0;
+    code.value = (uint16_t)sorted[0];
+    ReplicateValue(table, 1, total_size, code);
+    free(sorted);
+    return total_size;
+  }
+
+  {
+    int step;              // step size to replicate values in current table
+    uint32_t low = -1;     // low bits for current root entry
+    uint32_t mask = total_size - 1;    // mask for low bits
+    uint32_t key = 0;      // reversed prefix code
+    int num_nodes = 1;     // number of Huffman tree nodes
+    int num_open = 1;      // number of open branches in current tree level
+    int table_bits = root_bits;        // key length of current table
+    int table_size = 1 << table_bits;  // size of current table
+    symbol = 0;
+    // Fill in root table.
+    for (len = 1, step = 2; len <= root_bits; ++len, step <<= 1) {
+      num_open <<= 1;
+      num_nodes += num_open;
+      num_open -= count[len];
+      if (num_open < 0) {
+        free(sorted);
+        return 0;
+      }
+      for (; count[len] > 0; --count[len]) {
+        HuffmanCode code;
+        code.bits = (uint8_t)len;
+        code.value = (uint16_t)sorted[symbol++];
+        ReplicateValue(&table[key], step, table_size, code);
+        key = GetNextKey(key, len);
+      }
+    }
+
+    // Fill in 2nd level tables and add pointers to root table.
+    for (len = root_bits + 1, step = 2; len <= MAX_ALLOWED_CODE_LENGTH;
+         ++len, step <<= 1) {
+      num_open <<= 1;
+      num_nodes += num_open;
+      num_open -= count[len];
+      if (num_open < 0) {
+        free(sorted);
+        return 0;
+      }
+      for (; count[len] > 0; --count[len]) {
+        HuffmanCode code;
+        if ((key & mask) != low) {
+          table += table_size;
+          table_bits = NextTableBitSize(count, len, root_bits);
+          table_size = 1 << table_bits;
+          total_size += table_size;
+          low = key & mask;
+          root_table[low].bits = (uint8_t)(table_bits + root_bits);
+          root_table[low].value = (uint16_t)((table - root_table) - low);
+        }
+        code.bits = (uint8_t)(len - root_bits);
+        code.value = (uint16_t)sorted[symbol++];
+        ReplicateValue(&table[key >> root_bits], step, table_size, code);
+        key = GetNextKey(key, len);
+      }
+    }
+
+    // Check if tree is full.
+    if (num_nodes != 2 * offset[MAX_ALLOWED_CODE_LENGTH] - 1) {
+      free(sorted);
+      return 0;
+    }
+  }
+
+  free(sorted);
+  return total_size;
+}
diff --git a/src/loaders/webp/utils/huffman.h b/src/loaders/webp/utils/huffman.h
new file mode 100644
index 00000000..8b34da89
--- /dev/null
+++ b/src/loaders/webp/utils/huffman.h
@@ -0,0 +1,67 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for building and looking up Huffman trees.
+//
+// Author: Urvang Joshi (urvang@google.com)
+
+#ifndef WEBP_UTILS_HUFFMAN_H_
+#define WEBP_UTILS_HUFFMAN_H_
+
+#include <assert.h>
+#include "../webp/format_constants.h"
+#include "../webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HUFFMAN_TABLE_BITS      8
+#define HUFFMAN_TABLE_MASK      ((1 << HUFFMAN_TABLE_BITS) - 1)
+
+#define LENGTHS_TABLE_BITS      7
+#define LENGTHS_TABLE_MASK      ((1 << LENGTHS_TABLE_BITS) - 1)
+
+
+// Huffman lookup table entry
+typedef struct {
+  uint8_t bits;     // number of bits used for this symbol
+  uint16_t value;   // symbol value or table offset
+} HuffmanCode;
+
+// Huffman table group.
+typedef struct HTreeGroup HTreeGroup;
+struct HTreeGroup {
+  HuffmanCode* htrees[HUFFMAN_CODES_PER_META_CODE];
+  int      is_trivial_literal;  // True, if huffman trees for Red, Blue & Alpha
+                                // Symbols are trivial (have a single code).
+  uint32_t literal_arb;         // If is_trivial_literal is true, this is the
+                                // ARGB value of the pixel, with Green channel
+                                // being set to zero.
+};
+
+// Creates the instance of HTreeGroup with specified number of tree-groups.
+HTreeGroup* VP8LHtreeGroupsNew(int num_htree_groups);
+
+// Releases the memory allocated for HTreeGroup.
+void VP8LHtreeGroupsFree(HTreeGroup* const htree_groups);
+
+// Builds Huffman lookup table assuming code lengths are in symbol order.
+// The 'code_lengths' is pre-allocated temporary memory buffer used for creating
+// the huffman table.
+// Returns built table size or 0 in case of error (invalid tree or
+// memory error).
+int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+                          const int code_lengths[], int code_lengths_size);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_UTILS_HUFFMAN_H_
diff --git a/src/loaders/webp/utils/meson.build b/src/loaders/webp/utils/meson.build
new file mode 100644
index 00000000..69f05f44
--- /dev/null
+++ b/src/loaders/webp/utils/meson.build
@@ -0,0 +1,22 @@
+source_file = [
+   'bit_reader.h',
+   'bit_reader_inl.h',
+   'color_cache.h',
+   'endian_inl.h',
+   'huffman.h',
+   'quant_levels_dec.h',
+   'random.h',
+   'rescaler.h',
+   'utils.h',
+   'bit_reader.cpp',
+   'color_cache.cpp',
+   'huffman.cpp',
+   'quant_levels_dec.cpp',
+   'random.cpp',
+   'rescaler.cpp'
+]
+
+webp_deb += [declare_dependency(
+    include_directories : include_directories('.'),
+    sources : source_file
+)]
\ No newline at end of file
diff --git a/src/loaders/webp/utils/quant_levels_dec.cpp b/src/loaders/webp/utils/quant_levels_dec.cpp
new file mode 100644
index 00000000..5d054e50
--- /dev/null
+++ b/src/loaders/webp/utils/quant_levels_dec.cpp
@@ -0,0 +1,279 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Implement gradient smoothing: we replace a current alpha value by its
+// surrounding average if it's close enough (that is: the change will be less
+// than the minimum distance between two quantized level).
+// We use sliding window for computing the 2d moving average.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./quant_levels_dec.h"
+#include "./utils.h"
+
+#include <stdlib.h>
+#include <string.h>   // for memset
+
+// #define USE_DITHERING   // uncomment to enable ordered dithering (not vital)
+
+#define FIX 16     // fix-point precision for averaging
+#define LFIX 2     // extra precision for look-up table
+#define LUT_SIZE ((1 << (8 + LFIX)) - 1)  // look-up table size
+
+#if defined(USE_DITHERING)
+
+#define DFIX 4           // extra precision for ordered dithering
+#define DSIZE 4          // dithering size (must be a power of two)
+// cf. http://en.wikipedia.org/wiki/Ordered_dithering
+static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
+  {  0,  8,  2, 10 },     // coefficients are in DFIX fixed-point precision
+  { 12,  4, 14,  6 },
+  {  3, 11,  1,  9 },
+  { 15,  7, 13,  5 }
+};
+
+#else
+#define DFIX 0
+#endif
+
+typedef struct {
+  int width_, height_;  // dimension
+  int row_;             // current input row being processed
+  uint8_t* src_;        // input pointer
+  uint8_t* dst_;        // output pointer
+
+  int radius_;          // filter radius (=delay)
+  int scale_;           // normalization factor, in FIX bits precision
+
+  void* mem_;           // all memory
+
+  // various scratch buffers
+  uint16_t* start_;
+  uint16_t* cur_;
+  uint16_t* end_;
+  uint16_t* top_;
+  uint16_t* average_;
+
+  // input levels distribution
+  int num_levels_;       // number of quantized levels
+  int min_, max_;        // min and max level values
+  int min_level_dist_;   // smallest distance between two consecutive levels
+
+  int16_t* correction_;  // size = 1 + 2*LUT_SIZE  -> ~4k memory
+} SmoothParams;
+
+//------------------------------------------------------------------------------
+
+#define CLIP_MASK (int)(~0U << (8 + DFIX))
+static WEBP_INLINE uint8_t clip_8b(int v) {
+  return (!(v & CLIP_MASK)) ? (uint8_t)(v >> DFIX) : (v < 0) ? 0u : 255u;
+}
+
+// vertical accumulation
+static void VFilter(SmoothParams* const p) {
+  const uint8_t* src = p->src_;
+  const int w = p->width_;
+  uint16_t* const cur = p->cur_;
+  const uint16_t* const top = p->top_;
+  uint16_t* const out = p->end_;
+  uint16_t sum = 0;               // all arithmetic is modulo 16bit
+  int x;
+
+  for (x = 0; x < w; ++x) {
+    uint16_t new_value;
+    sum += src[x];
+    new_value = top[x] + sum;
+    out[x] = new_value - cur[x];  // vertical sum of 'r' pixels.
+    cur[x] = new_value;
+  }
+  // move input pointers one row down
+  p->top_ = p->cur_;
+  p->cur_ += w;
+  if (p->cur_ == p->end_) p->cur_ = p->start_;  // roll-over
+  // We replicate edges, as it's somewhat easier as a boundary condition.
+  // That's why we don't update the 'src' pointer on top/bottom area:
+  if (p->row_ >= 0 && p->row_ < p->height_ - 1) {
+    p->src_ += p->width_;
+  }
+}
+
+// horizontal accumulation. We use mirror replication of missing pixels, as it's
+// a little easier to implement (surprisingly).
+static void HFilter(SmoothParams* const p) {
+  const uint16_t* const in = p->end_;
+  uint16_t* const out = p->average_;
+  const uint32_t scale = p->scale_;
+  const int w = p->width_;
+  const int r = p->radius_;
+
+  int x;
+  for (x = 0; x <= r; ++x) {   // left mirroring
+    const uint16_t delta = in[x + r - 1] + in[r - x];
+    out[x] = (delta * scale) >> FIX;
+  }
+  for (; x < w - r; ++x) {     // bulk middle run
+    const uint16_t delta = in[x + r] - in[x - r - 1];
+    out[x] = (delta * scale) >> FIX;
+  }
+  for (; x < w; ++x) {         // right mirroring
+    const uint16_t delta =
+        2 * in[w - 1] - in[2 * w - 2 - r - x] - in[x - r - 1];
+    out[x] = (delta * scale) >> FIX;
+  }
+}
+
+// emit one filtered output row
+static void ApplyFilter(SmoothParams* const p) {
+  const uint16_t* const average = p->average_;
+  const int w = p->width_;
+  const int16_t* const correction = p->correction_;
+#if defined(USE_DITHERING)
+  const uint8_t* const dither = kOrderedDither[p->row_ % DSIZE];
+#endif
+  uint8_t* const dst = p->dst_;
+  int x;
+  for (x = 0; x < w; ++x) {
+    const int v = dst[x];
+    if (v < p->max_ && v > p->min_) {
+      const int c = (v << DFIX) + correction[average[x] - (v << LFIX)];
+#if defined(USE_DITHERING)
+      dst[x] = clip_8b(c + dither[x % DSIZE]);
+#else
+      dst[x] = clip_8b(c);
+#endif
+    }
+  }
+  p->dst_ += w;  // advance output pointer
+}
+
+//------------------------------------------------------------------------------
+// Initialize correction table
+
+static void InitCorrectionLUT(int16_t* const lut, int min_dist) {
+  // The correction curve is:
+  //   f(x) = x for x <= threshold2
+  //   f(x) = 0 for x >= threshold1
+  // and a linear interpolation for range x=[threshold2, threshold1]
+  // (along with f(-x) = -f(x) symmetry).
+  // Note that: threshold2 = 3/4 * threshold1
+  const int threshold1 = min_dist << LFIX;
+  const int threshold2 = (3 * threshold1) >> 2;
+  const int max_threshold = threshold2 << DFIX;
+  const int delta = threshold1 - threshold2;
+  int i;
+  for (i = 1; i <= LUT_SIZE; ++i) {
+    int c = (i <= threshold2) ? (i << DFIX)
+          : (i < threshold1) ? max_threshold * (threshold1 - i) / delta
+          : 0;
+    c >>= LFIX;
+    lut[+i] = +c;
+    lut[-i] = -c;
+  }
+  lut[0] = 0;
+}
+
+static void CountLevels(const uint8_t* const data, int size,
+                        SmoothParams* const p) {
+  int i, last_level;
+  uint8_t used_levels[256] = { 0 };
+  p->min_ = 255;
+  p->max_ = 0;
+  for (i = 0; i < size; ++i) {
+    const int v = data[i];
+    if (v < p->min_) p->min_ = v;
+    if (v > p->max_) p->max_ = v;
+    used_levels[v] = 1;
+  }
+  // Compute the mininum distance between two non-zero levels.
+  p->min_level_dist_ = p->max_ - p->min_;
+  last_level = -1;
+  for (i = 0; i < 256; ++i) {
+    if (used_levels[i]) {
+      ++p->num_levels_;
+      if (last_level >= 0) {
+        const int level_dist = i - last_level;
+        if (level_dist < p->min_level_dist_) {
+          p->min_level_dist_ = level_dist;
+        }
+      }
+      last_level = i;
+    }
+  }
+}
+
+// Initialize all params.
+static int InitParams(uint8_t* const data, int width, int height,
+                      int radius, SmoothParams* const p) {
+  const int R = 2 * radius + 1;  // total size of the kernel
+
+  const size_t size_scratch_m = (R + 1) * width * sizeof(*p->start_);
+  const size_t size_m =  width * sizeof(*p->average_);
+  const size_t size_lut = (1 + 2 * LUT_SIZE) * sizeof(*p->correction_);
+  const size_t total_size = size_scratch_m + size_m + size_lut;
+  uint8_t* mem = (uint8_t*)malloc(1U * total_size);
+
+  if (mem == NULL) return 0;
+  p->mem_ = (void*)mem;
+
+  p->start_ = (uint16_t*)mem;
+  p->cur_ = p->start_;
+  p->end_ = p->start_ + R * width;
+  p->top_ = p->end_ - width;
+  memset(p->top_, 0, width * sizeof(*p->top_));
+  mem += size_scratch_m;
+
+  p->average_ = (uint16_t*)mem;
+  mem += size_m;
+
+  p->width_ = width;
+  p->height_ = height;
+  p->src_ = data;
+  p->dst_ = data;
+  p->radius_ = radius;
+  p->scale_ = (1 << (FIX + LFIX)) / (R * R);  // normalization constant
+  p->row_ = -radius;
+
+  // analyze the input distribution so we can best-fit the threshold
+  CountLevels(data, width * height, p);
+
+  // correction table
+  p->correction_ = ((int16_t*)mem) + LUT_SIZE;
+  InitCorrectionLUT(p->correction_, p->min_level_dist_);
+
+  return 1;
+}
+
+static void CleanupParams(SmoothParams* const p) {
+  free(p->mem_);
+}
+
+int WebPDequantizeLevels(uint8_t* const data, int width, int height,
+                         int strength) {
+  const int radius = 4 * strength / 100;
+  if (strength < 0 || strength > 100) return 0;
+  if (data == NULL || width <= 0 || height <= 0) return 0;  // bad params
+  if (radius > 0) {
+    SmoothParams p;
+    memset(&p, 0, sizeof(p));
+    if (!InitParams(data, width, height, radius, &p)) return 0;
+    if (p.num_levels_ > 2) {
+      for (; p.row_ < p.height_; ++p.row_) {
+        VFilter(&p);  // accumulate average of input
+        // Need to wait few rows in order to prime the filter,
+        // before emitting some output.
+        if (p.row_ >= p.radius_) {
+          HFilter(&p);
+          ApplyFilter(&p);
+        }
+      }
+    }
+    CleanupParams(&p);
+  }
+  return 1;
+}
diff --git a/src/loaders/webp/utils/quant_levels_dec.h b/src/loaders/webp/utils/quant_levels_dec.h
new file mode 100644
index 00000000..9aab0680
--- /dev/null
+++ b/src/loaders/webp/utils/quant_levels_dec.h
@@ -0,0 +1,35 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Alpha plane de-quantization utility
+//
+// Author:  Vikas Arora (vikasa@google.com)
+
+#ifndef WEBP_UTILS_QUANT_LEVELS_DEC_H_
+#define WEBP_UTILS_QUANT_LEVELS_DEC_H_
+
+#include "../webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Apply post-processing to input 'data' of size 'width'x'height' assuming that
+// the source was quantized to a reduced number of levels.
+// Strength is in [0..100] and controls the amount of dithering applied.
+// Returns false in case of error (data is NULL, invalid parameters,
+// malloc failure, ...).
+int WebPDequantizeLevels(uint8_t* const data, int width, int height,
+                         int strength);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_QUANT_LEVELS_DEC_H_ */
diff --git a/src/loaders/webp/utils/random.cpp b/src/loaders/webp/utils/random.cpp
new file mode 100644
index 00000000..24e96ad6
--- /dev/null
+++ b/src/loaders/webp/utils/random.cpp
@@ -0,0 +1,43 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Pseudo-random utilities
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <string.h>
+#include "./random.h"
+
+//------------------------------------------------------------------------------
+
+// 31b-range values
+static const uint32_t kRandomTable[VP8_RANDOM_TABLE_SIZE] = {
+  0x0de15230, 0x03b31886, 0x775faccb, 0x1c88626a, 0x68385c55, 0x14b3b828,
+  0x4a85fef8, 0x49ddb84b, 0x64fcf397, 0x5c550289, 0x4a290000, 0x0d7ec1da,
+  0x5940b7ab, 0x5492577d, 0x4e19ca72, 0x38d38c69, 0x0c01ee65, 0x32a1755f,
+  0x5437f652, 0x5abb2c32, 0x0faa57b1, 0x73f533e7, 0x685feeda, 0x7563cce2,
+  0x6e990e83, 0x4730a7ed, 0x4fc0d9c6, 0x496b153c, 0x4f1403fa, 0x541afb0c,
+  0x73990b32, 0x26d7cb1c, 0x6fcc3706, 0x2cbb77d8, 0x75762f2a, 0x6425ccdd,
+  0x24b35461, 0x0a7d8715, 0x220414a8, 0x141ebf67, 0x56b41583, 0x73e502e3,
+  0x44cab16f, 0x28264d42, 0x73baaefb, 0x0a50ebed, 0x1d6ab6fb, 0x0d3ad40b,
+  0x35db3b68, 0x2b081e83, 0x77ce6b95, 0x5181e5f0, 0x78853bbc, 0x009f9494,
+  0x27e5ed3c
+};
+
+void VP8InitRandom(VP8Random* const rg, float dithering) {
+  memcpy(rg->tab_, kRandomTable, sizeof(rg->tab_));
+  rg->index1_ = 0;
+  rg->index2_ = 31;
+  rg->amp_ = (dithering < 0.0) ? 0
+           : (dithering > 1.0) ? (1 << VP8_RANDOM_DITHER_FIX)
+           : (uint32_t)((1 << VP8_RANDOM_DITHER_FIX) * dithering);
+}
+
+//------------------------------------------------------------------------------
+
diff --git a/src/loaders/webp/utils/random.h b/src/loaders/webp/utils/random.h
new file mode 100644
index 00000000..c392a615
--- /dev/null
+++ b/src/loaders/webp/utils/random.h
@@ -0,0 +1,63 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Pseudo-random utilities
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_RANDOM_H_
+#define WEBP_UTILS_RANDOM_H_
+
+#include <assert.h>
+#include "../webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define VP8_RANDOM_DITHER_FIX 8   // fixed-point precision for dithering
+#define VP8_RANDOM_TABLE_SIZE 55
+
+typedef struct {
+  int index1_, index2_;
+  uint32_t tab_[VP8_RANDOM_TABLE_SIZE];
+  int amp_;
+} VP8Random;
+
+// Initializes random generator with an amplitude 'dithering' in range [0..1].
+void VP8InitRandom(VP8Random* const rg, float dithering);
+
+// Returns a centered pseudo-random number with 'num_bits' amplitude.
+// (uses D.Knuth's Difference-based random generator).
+// 'amp' is in VP8_RANDOM_DITHER_FIX fixed-point precision.
+static WEBP_INLINE int VP8RandomBits2(VP8Random* const rg, int num_bits,
+                                      int amp) {
+  int diff;
+  assert(num_bits + VP8_RANDOM_DITHER_FIX <= 31);
+  diff = rg->tab_[rg->index1_] - rg->tab_[rg->index2_];
+  if (diff < 0) diff += (1u << 31);
+  rg->tab_[rg->index1_] = diff;
+  if (++rg->index1_ == VP8_RANDOM_TABLE_SIZE) rg->index1_ = 0;
+  if (++rg->index2_ == VP8_RANDOM_TABLE_SIZE) rg->index2_ = 0;
+  // sign-extend, 0-center
+  diff = (int)((uint32_t)diff << 1) >> (32 - num_bits);
+  diff = (diff * amp) >> VP8_RANDOM_DITHER_FIX;  // restrict range
+  diff += 1 << (num_bits - 1);                   // shift back to 0.5-center
+  return diff;
+}
+
+static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
+  return VP8RandomBits2(rg, num_bits, rg->amp_);
+}
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_RANDOM_H_ */
diff --git a/src/loaders/webp/utils/rescaler.cpp b/src/loaders/webp/utils/rescaler.cpp
new file mode 100644
index 00000000..cd45ae8d
--- /dev/null
+++ b/src/loaders/webp/utils/rescaler.cpp
@@ -0,0 +1,82 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Rescaling functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include "../dsp/dsp.h"
+#include "./rescaler.h"
+
+//------------------------------------------------------------------------------
+
+void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
+                      uint8_t* const dst, int dst_width, int dst_height,
+                      int dst_stride, int num_channels, int x_add, int x_sub,
+                      int y_add, int y_sub, int32_t* const work) {
+  wrk->x_expand = (src_width < dst_width);
+  wrk->src_width = src_width;
+  wrk->src_height = src_height;
+  wrk->dst_width = dst_width;
+  wrk->dst_height = dst_height;
+  wrk->dst = dst;
+  wrk->dst_stride = dst_stride;
+  wrk->num_channels = num_channels;
+  // for 'x_expand', we use bilinear interpolation
+  wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub;
+  wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub;
+  wrk->y_accum = y_add;
+  wrk->y_add = y_add;
+  wrk->y_sub = y_sub;
+  wrk->fx_scale = (1 << WEBP_RESCALER_RFIX) / x_sub;
+  wrk->fy_scale = (1 << WEBP_RESCALER_RFIX) / y_sub;
+  wrk->fxy_scale = wrk->x_expand ?
+      ((int64_t)dst_height << WEBP_RESCALER_RFIX) / (x_sub * src_height) :
+      ((int64_t)dst_height << WEBP_RESCALER_RFIX) / (x_add * src_height);
+  wrk->irow = work;
+  wrk->frow = work + num_channels * dst_width;
+
+  WebPRescalerDspInit();
+}
+
+//------------------------------------------------------------------------------
+// all-in-one calls
+
+int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
+  const int num_lines = (wrk->y_accum + wrk->y_sub - 1) / wrk->y_sub;
+  return (num_lines > max_num_lines) ? max_num_lines : num_lines;
+}
+
+int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
+                       const uint8_t* src, int src_stride) {
+  int total_imported = 0;
+  while (total_imported < num_lines && wrk->y_accum > 0) {
+    int channel;
+    for (channel = 0; channel < wrk->num_channels; ++channel) {
+      WebPRescalerImportRow(wrk, src, channel);
+    }
+    src += src_stride;
+    ++total_imported;
+    wrk->y_accum -= wrk->y_sub;
+  }
+  return total_imported;
+}
+
+int WebPRescalerExport(WebPRescaler* const rescaler) {
+  int total_exported = 0;
+  while (WebPRescalerHasPendingOutput(rescaler)) {
+    WebPRescalerExportRow(rescaler, 0);
+    ++total_exported;
+  }
+  return total_exported;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/loaders/webp/utils/rescaler.h b/src/loaders/webp/utils/rescaler.h
new file mode 100644
index 00000000..dd138b1d
--- /dev/null
+++ b/src/loaders/webp/utils/rescaler.h
@@ -0,0 +1,78 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Rescaling functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_RESCALER_H_
+#define WEBP_UTILS_RESCALER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "../webp/types.h"
+
+#define WEBP_RESCALER_RFIX 30   // fixed-point precision for multiplies
+
+// Structure used for on-the-fly rescaling
+typedef struct WebPRescaler WebPRescaler;
+struct WebPRescaler {
+  int x_expand;               // true if we're expanding in the x direction
+  int num_channels;           // bytes to jump between pixels
+  int fy_scale, fx_scale;     // fixed-point scaling factor
+  int64_t fxy_scale;          // ''
+  // we need hpel-precise add/sub increments, for the downsampled U/V planes.
+  int y_accum;                // vertical accumulator
+  int y_add, y_sub;           // vertical increments (add ~= src, sub ~= dst)
+  int x_add, x_sub;           // horizontal increments (add ~= src, sub ~= dst)
+  int src_width, src_height;  // source dimensions
+  int dst_width, dst_height;  // destination dimensions
+  uint8_t* dst;
+  int dst_stride;
+  int32_t* irow, *frow;       // work buffer
+};
+
+// Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
+void WebPRescalerInit(WebPRescaler* const rescaler,
+                      int src_width, int src_height,
+                      uint8_t* const dst,
+                      int dst_width, int dst_height, int dst_stride,
+                      int num_channels,
+                      int x_add, int x_sub,
+                      int y_add, int y_sub,
+                      int32_t* const work);
+
+// Returns the number of input lines needed next to produce one output line,
+// considering that the maximum available input lines are 'max_num_lines'.
+int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
+                           int max_num_lines);
+
+// Import multiple rows over all channels, until at least one row is ready to
+// be exported. Returns the actual number of lines that were imported.
+int WebPRescalerImport(WebPRescaler* const rescaler, int num_rows,
+                       const uint8_t* src, int src_stride);
+
+// Return true if there is pending output rows ready.
+static WEBP_INLINE
+int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
+  return (rescaler->y_accum <= 0);
+}
+
+// Export as many rows as possible. Return the numbers of rows written.
+int WebPRescalerExport(WebPRescaler* const rescaler);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_RESCALER_H_ */
diff --git a/src/loaders/webp/utils/utils.h b/src/loaders/webp/utils/utils.h
new file mode 100644
index 00000000..3dba9427
--- /dev/null
+++ b/src/loaders/webp/utils/utils.h
@@ -0,0 +1,68 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Misc. common utility functions
+//
+// Authors: Skal (pascal.massimino@gmail.com)
+//          Urvang (urvang@google.com)
+
+#ifndef WEBP_UTILS_UTILS_H_
+#define WEBP_UTILS_UTILS_H_
+
+#include <assert.h>
+
+#include "../webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Returns (int)floor(log2(n)). n must be > 0.
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  return 31 ^ __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
+      (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  uint32_t first_set_bit;
+  _BitScanReverse((unsigned long*)&first_set_bit, n);
+  return first_set_bit;
+}
+#else
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int log = 0;
+  uint32_t value = n;
+  int i;
+
+  for (i = 4; i >= 0; --i) {
+    const int shift = (1 << i);
+    const uint32_t x = value >> shift;
+    if (x != 0) {
+      value = x;
+      log += shift;
+    }
+  }
+  return log;
+}
+#endif
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_UTILS_H_ */
diff --git a/src/loaders/webp/webp/decode.h b/src/loaders/webp/webp/decode.h
new file mode 100644
index 00000000..e0dd5703
--- /dev/null
+++ b/src/loaders/webp/webp/decode.h
@@ -0,0 +1,493 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Main decoding functions for WebP images.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_WEBP_DECODE_H_
+#define WEBP_WEBP_DECODE_H_
+
+#include "./types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define WEBP_DECODER_ABI_VERSION 0x0205    // MAJOR(8b) + MINOR(8b)
+
+// Note: forward declaring enumerations is not allowed in (strict) C and C++,
+// the types are left here for reference.
+// typedef enum VP8StatusCode VP8StatusCode;
+// typedef enum WEBP_CSP_MODE WEBP_CSP_MODE;
+typedef struct WebPRGBABuffer WebPRGBABuffer;
+typedef struct WebPYUVABuffer WebPYUVABuffer;
+typedef struct WebPDecBuffer WebPDecBuffer;
+typedef struct WebPIDecoder WebPIDecoder;
+typedef struct WebPBitstreamFeatures WebPBitstreamFeatures;
+typedef struct WebPDecoderOptions WebPDecoderOptions;
+typedef struct WebPDecoderConfig WebPDecoderConfig;
+
+// Return the decoder's version number, packed in hexadecimal using 8bits for
+// each of major/minor/revision. E.g: v2.5.7 is 0x020507.
+WEBP_EXTERN(int) WebPGetDecoderVersion(void);
+
+// Retrieve basic header information: width, height.
+// This function will also validate the header and return 0 in
+// case of formatting error.
+// Pointers 'width' and 'height' can be passed NULL if deemed irrelevant.
+WEBP_EXTERN(int) WebPGetInfo(const uint8_t* data, size_t data_size,
+                             int* width, int* height);
+
+// Decodes WebP images pointed to by 'data' and returns RGBA samples, along
+// with the dimensions in *width and *height. The ordering of samples in
+// memory is R, G, B, A, R, G, B, A... in scan order (endian-independent).
+// The returned pointer should be deleted calling free().
+// Returns NULL in case of error.
+WEBP_EXTERN(uint8_t*) WebPDecodeRGBA(const uint8_t* data, size_t data_size,
+                                     int* width, int* height);
+
+// Same as WebPDecodeRGBA, but returning A, R, G, B, A, R, G, B... ordered data.
+WEBP_EXTERN(uint8_t*) WebPDecodeARGB(const uint8_t* data, size_t data_size,
+                                     int* width, int* height);
+
+// Same as WebPDecodeRGBA, but returning B, G, R, A, B, G, R, A... ordered data.
+WEBP_EXTERN(uint8_t*) WebPDecodeBGRA(const uint8_t* data, size_t data_size,
+                                     int* width, int* height);
+
+// Same as WebPDecodeRGBA, but returning R, G, B, R, G, B... ordered data.
+// If the bitstream contains transparency, it is ignored.
+WEBP_EXTERN(uint8_t*) WebPDecodeRGB(const uint8_t* data, size_t data_size,
+                                    int* width, int* height);
+
+// Same as WebPDecodeRGB, but returning B, G, R, B, G, R... ordered data.
+WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, size_t data_size,
+                                    int* width, int* height);
+
+
+// Decode WebP images pointed to by 'data' to Y'UV format(*). The pointer
+// returned is the Y samples buffer. Upon return, *u and *v will point to
+// the U and V chroma data. These U and V buffers need NOT be free()'d,
+// unlike the returned Y luma one. The dimension of the U and V planes
+// are both (*width + 1) / 2 and (*height + 1)/ 2.
+// Upon return, the Y buffer has a stride returned as '*stride', while U and V
+// have a common stride returned as '*uv_stride'.
+// Return NULL in case of error.
+// (*) Also named Y'CbCr. See: http://en.wikipedia.org/wiki/YCbCr
+WEBP_EXTERN(uint8_t*) WebPDecodeYUV(const uint8_t* data, size_t data_size,
+                                    int* width, int* height,
+                                    uint8_t** u, uint8_t** v,
+                                    int* stride, int* uv_stride);
+
+// These five functions are variants of the above ones, that decode the image
+// directly into a pre-allocated buffer 'output_buffer'. The maximum storage
+// available in this buffer is indicated by 'output_buffer_size'. If this
+// storage is not sufficient (or an error occurred), NULL is returned.
+// Otherwise, output_buffer is returned, for convenience.
+// The parameter 'output_stride' specifies the distance (in bytes)
+// between scanlines. Hence, output_buffer_size is expected to be at least
+// output_stride x picture-height.
+WEBP_EXTERN(uint8_t*) WebPDecodeRGBAInto(
+    const uint8_t* data, size_t data_size,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeARGBInto(
+    const uint8_t* data, size_t data_size,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeBGRAInto(
+    const uint8_t* data, size_t data_size,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
+
+// RGB and BGR variants. Here too the transparency information, if present,
+// will be dropped and ignored.
+WEBP_EXTERN(uint8_t*) WebPDecodeRGBInto(
+    const uint8_t* data, size_t data_size,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
+WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto(
+    const uint8_t* data, size_t data_size,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
+
+// WebPDecodeYUVInto() is a variant of WebPDecodeYUV() that operates directly
+// into pre-allocated luma/chroma plane buffers. This function requires the
+// strides to be passed: one for the luma plane and one for each of the
+// chroma ones. The size of each plane buffer is passed as 'luma_size',
+// 'u_size' and 'v_size' respectively.
+// Pointer to the luma plane ('*luma') is returned or NULL if an error occurred
+// during decoding (or because some buffers were found to be too small).
+WEBP_EXTERN(uint8_t*) WebPDecodeYUVInto(
+    const uint8_t* data, size_t data_size,
+    uint8_t* luma, size_t luma_size, int luma_stride,
+    uint8_t* u, size_t u_size, int u_stride,
+    uint8_t* v, size_t v_size, int v_stride);
+
+//------------------------------------------------------------------------------
+// Output colorspaces and buffer
+
+// Colorspaces
+// Note: the naming describes the byte-ordering of packed samples in memory.
+// For instance, MODE_BGRA relates to samples ordered as B,G,R,A,B,G,R,A,...
+// Non-capital names (e.g.:MODE_Argb) relates to pre-multiplied RGB channels.
+// RGBA-4444 and RGB-565 colorspaces are represented by following byte-order:
+// RGBA-4444: [r3 r2 r1 r0 g3 g2 g1 g0], [b3 b2 b1 b0 a3 a2 a1 a0], ...
+// RGB-565: [r4 r3 r2 r1 r0 g5 g4 g3], [g2 g1 g0 b4 b3 b2 b1 b0], ...
+// In the case WEBP_SWAP_16BITS_CSP is defined, the bytes are swapped for
+// these two modes:
+// RGBA-4444: [b3 b2 b1 b0 a3 a2 a1 a0], [r3 r2 r1 r0 g3 g2 g1 g0], ...
+// RGB-565: [g2 g1 g0 b4 b3 b2 b1 b0], [r4 r3 r2 r1 r0 g5 g4 g3], ...
+
+typedef enum WEBP_CSP_MODE {
+  MODE_RGB = 0, MODE_RGBA = 1,
+  MODE_BGR = 2, MODE_BGRA = 3,
+  MODE_ARGB = 4, MODE_RGBA_4444 = 5,
+  MODE_RGB_565 = 6,
+  // RGB-premultiplied transparent modes (alpha value is preserved)
+  MODE_rgbA = 7,
+  MODE_bgrA = 8,
+  MODE_Argb = 9,
+  MODE_rgbA_4444 = 10,
+  // YUV modes must come after RGB ones.
+  MODE_YUV = 11, MODE_YUVA = 12,  // yuv 4:2:0
+  MODE_LAST = 13
+} WEBP_CSP_MODE;
+
+// Some useful macros:
+static WEBP_INLINE int WebPIsPremultipliedMode(WEBP_CSP_MODE mode) {
+  return (mode == MODE_rgbA || mode == MODE_bgrA || mode == MODE_Argb ||
+          mode == MODE_rgbA_4444);
+}
+
+static WEBP_INLINE int WebPIsAlphaMode(WEBP_CSP_MODE mode) {
+  return (mode == MODE_RGBA || mode == MODE_BGRA || mode == MODE_ARGB ||
+          mode == MODE_RGBA_4444 || mode == MODE_YUVA ||
+          WebPIsPremultipliedMode(mode));
+}
+
+static WEBP_INLINE int WebPIsRGBMode(WEBP_CSP_MODE mode) {
+  return (mode < MODE_YUV);
+}
+
+//------------------------------------------------------------------------------
+// WebPDecBuffer: Generic structure for describing the output sample buffer.
+
+struct WebPRGBABuffer {    // view as RGBA
+  uint8_t* rgba;    // pointer to RGBA samples
+  int stride;       // stride in bytes from one scanline to the next.
+  size_t size;      // total size of the *rgba buffer.
+};
+
+struct WebPYUVABuffer {              // view as YUVA
+  uint8_t* y, *u, *v, *a;     // pointer to luma, chroma U/V, alpha samples
+  int y_stride;               // luma stride
+  int u_stride, v_stride;     // chroma strides
+  int a_stride;               // alpha stride
+  size_t y_size;              // luma plane size
+  size_t u_size, v_size;      // chroma planes size
+  size_t a_size;              // alpha-plane size
+};
+
+// Output buffer
+struct WebPDecBuffer {
+  WEBP_CSP_MODE colorspace;  // Colorspace.
+  int width, height;         // Dimensions.
+  int is_external_memory;    // If true, 'internal_memory' pointer is not used.
+  union {
+    WebPRGBABuffer RGBA;
+    WebPYUVABuffer YUVA;
+  } u;                       // Nameless union of buffer parameters.
+  uint32_t       pad[4];     // padding for later use
+
+  uint8_t* private_memory;   // Internally allocated memory (only when
+                             // is_external_memory is false). Should not be used
+                             // externally, but accessed via the buffer union.
+};
+
+// Internal, version-checked, entry point
+WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer*, int);
+
+// Initialize the structure as empty. Must be called before any other use.
+// Returns false in case of version mismatch
+static WEBP_INLINE int WebPInitDecBuffer(WebPDecBuffer* buffer) {
+  return WebPInitDecBufferInternal(buffer, WEBP_DECODER_ABI_VERSION);
+}
+
+// Free any memory associated with the buffer. Must always be called last.
+// Note: doesn't free the 'buffer' structure itself.
+WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* buffer);
+
+//------------------------------------------------------------------------------
+// Enumeration of the status codes
+
+typedef enum VP8StatusCode {
+  VP8_STATUS_OK = 0,
+  VP8_STATUS_OUT_OF_MEMORY,
+  VP8_STATUS_INVALID_PARAM,
+  VP8_STATUS_BITSTREAM_ERROR,
+  VP8_STATUS_UNSUPPORTED_FEATURE,
+  VP8_STATUS_SUSPENDED,
+  VP8_STATUS_USER_ABORT,
+  VP8_STATUS_NOT_ENOUGH_DATA
+} VP8StatusCode;
+
+//------------------------------------------------------------------------------
+// Incremental decoding
+//
+// This API allows streamlined decoding of partial data.
+// Picture can be incrementally decoded as data become available thanks to the
+// WebPIDecoder object. This object can be left in a SUSPENDED state if the
+// picture is only partially decoded, pending additional input.
+// Code example:
+//
+//   WebPInitDecBuffer(&buffer);
+//   buffer.colorspace = mode;
+//   ...
+//   WebPIDecoder* idec = WebPINewDecoder(&buffer);
+//   while (has_more_data) {
+//     // ... (get additional data)
+//     status = WebPIAppend(idec, new_data, new_data_size);
+//     if (status != VP8_STATUS_SUSPENDED ||
+//       break;
+//     }
+//
+//     // The above call decodes the current available buffer.
+//     // Part of the image can now be refreshed by calling to
+//     // WebPIDecGetRGB()/WebPIDecGetYUVA() etc.
+//   }
+//   WebPIDelete(idec);
+
+// Creates a new incremental decoder with the supplied buffer parameter.
+// This output_buffer can be passed NULL, in which case a default output buffer
+// is used (with MODE_RGB). Otherwise, an internal reference to 'output_buffer'
+// is kept, which means that the lifespan of 'output_buffer' must be larger than
+// that of the returned WebPIDecoder object.
+// The supplied 'output_buffer' content MUST NOT be changed between calls to
+// WebPIAppend() or WebPIUpdate() unless 'output_buffer.is_external_memory' is
+// set to 1. In such a case, it is allowed to modify the pointers, size and
+// stride of output_buffer.u.RGBA or output_buffer.u.YUVA, provided they remain
+// within valid bounds.
+// All other fields of WebPDecBuffer MUST remain constant between calls.
+// Returns NULL if the allocation failed.
+WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* output_buffer);
+
+// This function allocates and initializes an incremental-decoder object, which
+// will output the RGB/A samples specified by 'csp' into a preallocated
+// buffer 'output_buffer'. The size of this buffer is at least
+// 'output_buffer_size' and the stride (distance in bytes between two scanlines)
+// is specified by 'output_stride'.
+// Additionally, output_buffer can be passed NULL in which case the output
+// buffer will be allocated automatically when the decoding starts. The
+// colorspace 'csp' is taken into account for allocating this buffer. All other
+// parameters are ignored.
+// Returns NULL if the allocation failed, or if some parameters are invalid.
+WEBP_EXTERN(WebPIDecoder*) WebPINewRGB(
+    WEBP_CSP_MODE csp,
+    uint8_t* output_buffer, size_t output_buffer_size, int output_stride);
+
+// This function allocates and initializes an incremental-decoder object, which
+// will output the raw luma/chroma samples into a preallocated planes if
+// supplied. The luma plane is specified by its pointer 'luma', its size
+// 'luma_size' and its stride 'luma_stride'. Similarly, the chroma-u plane
+// is specified by the 'u', 'u_size' and 'u_stride' parameters, and the chroma-v
+// plane by 'v' and 'v_size'. And same for the alpha-plane. The 'a' pointer
+// can be pass NULL in case one is not interested in the transparency plane.
+// Conversely, 'luma' can be passed NULL if no preallocated planes are supplied.
+// In this case, the output buffer will be automatically allocated (using
+// MODE_YUVA) when decoding starts. All parameters are then ignored.
+// Returns NULL if the allocation failed or if a parameter is invalid.
+WEBP_EXTERN(WebPIDecoder*) WebPINewYUVA(
+    uint8_t* luma, size_t luma_size, int luma_stride,
+    uint8_t* u, size_t u_size, int u_stride,
+    uint8_t* v, size_t v_size, int v_stride,
+    uint8_t* a, size_t a_size, int a_stride);
+
+// Deprecated version of the above, without the alpha plane.
+// Kept for backward compatibility.
+WEBP_EXTERN(WebPIDecoder*) WebPINewYUV(
+    uint8_t* luma, size_t luma_size, int luma_stride,
+    uint8_t* u, size_t u_size, int u_stride,
+    uint8_t* v, size_t v_size, int v_stride);
+
+// Deletes the WebPIDecoder object and associated memory. Must always be called
+// if WebPINewDecoder, WebPINewRGB or WebPINewYUV succeeded.
+WEBP_EXTERN(void) WebPIDelete(WebPIDecoder* idec);
+
+// Copies and decodes the next available data. Returns VP8_STATUS_OK when
+// the image is successfully decoded. Returns VP8_STATUS_SUSPENDED when more
+// data is expected. Returns error in other cases.
+WEBP_EXTERN(VP8StatusCode) WebPIAppend(
+    WebPIDecoder* idec, const uint8_t* data, size_t data_size);
+
+// A variant of the above function to be used when data buffer contains
+// partial data from the beginning. In this case data buffer is not copied
+// to the internal memory.
+// Note that the value of the 'data' pointer can change between calls to
+// WebPIUpdate, for instance when the data buffer is resized to fit larger data.
+WEBP_EXTERN(VP8StatusCode) WebPIUpdate(
+    WebPIDecoder* idec, const uint8_t* data, size_t data_size);
+
+// Returns the RGB/A image decoded so far. Returns NULL if output params
+// are not initialized yet. The RGB/A output type corresponds to the colorspace
+// specified during call to WebPINewDecoder() or WebPINewRGB().
+// *last_y is the index of last decoded row in raster scan order. Some pointers
+// (*last_y, *width etc.) can be NULL if corresponding information is not
+// needed.
+WEBP_EXTERN(uint8_t*) WebPIDecGetRGB(
+    const WebPIDecoder* idec, int* last_y,
+    int* width, int* height, int* stride);
+
+// Same as above function to get a YUVA image. Returns pointer to the luma
+// plane or NULL in case of error. If there is no alpha information
+// the alpha pointer '*a' will be returned NULL.
+WEBP_EXTERN(uint8_t*) WebPIDecGetYUVA(
+    const WebPIDecoder* idec, int* last_y,
+    uint8_t** u, uint8_t** v, uint8_t** a,
+    int* width, int* height, int* stride, int* uv_stride, int* a_stride);
+
+// Deprecated alpha-less version of WebPIDecGetYUVA(): it will ignore the
+// alpha information (if present). Kept for backward compatibility.
+static WEBP_INLINE uint8_t* WebPIDecGetYUV(
+    const WebPIDecoder* idec, int* last_y, uint8_t** u, uint8_t** v,
+    int* width, int* height, int* stride, int* uv_stride) {
+  return WebPIDecGetYUVA(idec, last_y, u, v, NULL, width, height,
+                         stride, uv_stride, NULL);
+}
+
+// Generic call to retrieve information about the displayable area.
+// If non NULL, the left/right/width/height pointers are filled with the visible
+// rectangular area so far.
+// Returns NULL in case the incremental decoder object is in an invalid state.
+// Otherwise returns the pointer to the internal representation. This structure
+// is read-only, tied to WebPIDecoder's lifespan and should not be modified.
+WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea(
+    const WebPIDecoder* idec, int* left, int* top, int* width, int* height);
+
+//------------------------------------------------------------------------------
+// Advanced decoding parametrization
+//
+//  Code sample for using the advanced decoding API
+/*
+     // A) Init a configuration object
+     WebPDecoderConfig config;
+     CHECK(WebPInitDecoderConfig(&config));
+
+     // B) optional: retrieve the bitstream's features.
+     CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK);
+
+     // C) Adjust 'config', if needed
+     config.no_fancy_upsampling = 1;
+     config.output.colorspace = MODE_BGRA;
+     // etc.
+
+     // Note that you can also make config.output point to an externally
+     // supplied memory buffer, provided it's big enough to store the decoded
+     // picture. Otherwise, config.output will just be used to allocate memory
+     // and store the decoded picture.
+
+     // D) Decode!
+     CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK);
+
+     // E) Decoded image is now in config.output (and config.output.u.RGBA)
+
+     // F) Reclaim memory allocated in config's object. It's safe to call
+     // this function even if the memory is external and wasn't allocated
+     // by WebPDecode().
+     WebPFreeDecBuffer(&config.output);
+*/
+
+// Features gathered from the bitstream
+struct WebPBitstreamFeatures {
+  int width;          // Width in pixels, as read from the bitstream.
+  int height;         // Height in pixels, as read from the bitstream.
+  int has_alpha;      // True if the bitstream contains an alpha channel.
+  int has_animation;  // True if the bitstream is an animation.
+  int format;         // 0 = undefined (/mixed), 1 = lossy, 2 = lossless
+
+  // Unused for now:
+  int no_incremental_decoding;  // if true, using incremental decoding is not
+                                // recommended.
+  int rotate;                   // TODO(later)
+  int uv_sampling;              // should be 0 for now. TODO(later)
+  uint32_t pad[2];              // padding for later use
+};
+
+// Internal, version-checked, entry point
+WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal(
+    const uint8_t*, size_t, WebPBitstreamFeatures*, int);
+
+// Retrieve features from the bitstream. The *features structure is filled
+// with information gathered from the bitstream.
+// Returns VP8_STATUS_OK when the features are successfully retrieved. Returns
+// VP8_STATUS_NOT_ENOUGH_DATA when more data is needed to retrieve the
+// features from headers. Returns error in other cases.
+static WEBP_INLINE VP8StatusCode WebPGetFeatures(
+    const uint8_t* data, size_t data_size,
+    WebPBitstreamFeatures* features) {
+  return WebPGetFeaturesInternal(data, data_size, features,
+                                 WEBP_DECODER_ABI_VERSION);
+}
+
+// Decoding options
+struct WebPDecoderOptions {
+  int bypass_filtering;               // if true, skip the in-loop filtering
+  int no_fancy_upsampling;            // if true, use faster pointwise upsampler
+  int use_cropping;                   // if true, cropping is applied _first_
+  int crop_left, crop_top;            // top-left position for cropping.
+                                      // Will be snapped to even values.
+  int crop_width, crop_height;        // dimension of the cropping area
+  int use_scaling;                    // if true, scaling is applied _afterward_
+  int scaled_width, scaled_height;    // final resolution
+  int use_threads;                    // if true, use multi-threaded decoding
+  int dithering_strength;             // dithering strength (0=Off, 100=full)
+  int flip;                           // flip output vertically
+  int alpha_dithering_strength;       // alpha dithering strength in [0..100]
+
+  // Unused for now:
+  int force_rotation;                 // forced rotation (to be applied _last_)
+  int no_enhancement;                 // if true, discard enhancement layer
+  uint32_t pad[3];                    // padding for later use
+};
+
+// Main object storing the configuration for advanced decoding.
+struct WebPDecoderConfig {
+  WebPBitstreamFeatures input;  // Immutable bitstream features (optional)
+  WebPDecBuffer output;         // Output buffer (can point to external mem)
+  WebPDecoderOptions options;   // Decoding options
+};
+
+// Internal, version-checked, entry point
+WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig*, int);
+
+// Initialize the configuration as empty. This function must always be
+// called first, unless WebPGetFeatures() is to be called.
+// Returns false in case of mismatched version.
+static WEBP_INLINE int WebPInitDecoderConfig(WebPDecoderConfig* config) {
+  return WebPInitDecoderConfigInternal(config, WEBP_DECODER_ABI_VERSION);
+}
+
+// Instantiate a new incremental decoder object with the requested
+// configuration. The bitstream can be passed using 'data' and 'data_size'
+// parameter, in which case the features will be parsed and stored into
+// config->input. Otherwise, 'data' can be NULL and no parsing will occur.
+// Note that 'config' can be NULL too, in which case a default configuration
+// is used.
+// The return WebPIDecoder object must always be deleted calling WebPIDelete().
+// Returns NULL in case of error (and config->status will then reflect
+// the error condition).
+WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, size_t data_size,
+                                       WebPDecoderConfig* config);
+
+// Non-incremental version. This version decodes the full data at once, taking
+// 'config' into account. Returns decoding status (which should be VP8_STATUS_OK
+// if the decoding was successful).
+WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, size_t data_size,
+                                      WebPDecoderConfig* config);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_WEBP_DECODE_H_ */
diff --git a/src/loaders/webp/webp/format_constants.h b/src/loaders/webp/webp/format_constants.h
new file mode 100644
index 00000000..4c04b50c
--- /dev/null
+++ b/src/loaders/webp/webp/format_constants.h
@@ -0,0 +1,88 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Internal header for constants related to WebP file format.
+//
+// Author: Urvang (urvang@google.com)
+
+#ifndef WEBP_WEBP_FORMAT_CONSTANTS_H_
+#define WEBP_WEBP_FORMAT_CONSTANTS_H_
+
+// Create fourcc of the chunk from the chunk tag characters.
+#define MKFOURCC(a, b, c, d) ((uint32_t)(a) | (b) << 8 | (c) << 16 | (d) << 24)
+
+// VP8 related constants.
+#define VP8_SIGNATURE 0x9d012a              // Signature in VP8 data.
+#define VP8_MAX_PARTITION0_SIZE (1 << 19)   // max size of mode partition
+#define VP8_MAX_PARTITION_SIZE  (1 << 24)   // max size for token partition
+#define VP8_FRAME_HEADER_SIZE 10  // Size of the frame header within VP8 data.
+
+// VP8L related constants.
+#define VP8L_SIGNATURE_SIZE          1      // VP8L signature size.
+#define VP8L_MAGIC_BYTE              0x2f   // VP8L signature byte.
+#define VP8L_IMAGE_SIZE_BITS         14     // Number of bits used to store
+                                            // width and height.
+#define VP8L_VERSION_BITS            3      // 3 bits reserved for version.
+#define VP8L_VERSION                 0      // version 0
+#define VP8L_FRAME_HEADER_SIZE       5      // Size of the VP8L frame header.
+
+#define MAX_PALETTE_SIZE             256
+#define MAX_CACHE_BITS               11
+#define HUFFMAN_CODES_PER_META_CODE  5
+#define ARGB_BLACK                   0xff000000
+
+#define DEFAULT_CODE_LENGTH          8
+#define MAX_ALLOWED_CODE_LENGTH      15
+
+#define NUM_LITERAL_CODES            256
+#define NUM_LENGTH_CODES             24
+#define NUM_DISTANCE_CODES           40
+#define CODE_LENGTH_CODES            19
+
+#define MIN_HUFFMAN_BITS             2  // min number of Huffman bits
+#define MAX_HUFFMAN_BITS             9  // max number of Huffman bits
+
+#define TRANSFORM_PRESENT            1  // The bit to be written when next data
+                                        // to be read is a transform.
+#define NUM_TRANSFORMS               4  // Maximum number of allowed transform
+                                        // in a bitstream.
+typedef enum {
+  PREDICTOR_TRANSFORM      = 0,
+  CROSS_COLOR_TRANSFORM    = 1,
+  SUBTRACT_GREEN           = 2,
+  COLOR_INDEXING_TRANSFORM = 3
+} VP8LImageTransformType;
+
+// Alpha related constants.
+#define ALPHA_HEADER_LEN            1
+#define ALPHA_NO_COMPRESSION        0
+#define ALPHA_LOSSLESS_COMPRESSION  1
+#define ALPHA_PREPROCESSED_LEVELS   1
+
+// Mux related constants.
+#define TAG_SIZE           4     // Size of a chunk tag (e.g. "VP8L").
+#define CHUNK_SIZE_BYTES   4     // Size needed to store chunk's size.
+#define CHUNK_HEADER_SIZE  8     // Size of a chunk header.
+#define RIFF_HEADER_SIZE   12    // Size of the RIFF header ("RIFFnnnnWEBP").
+#define ANMF_CHUNK_SIZE    16    // Size of an ANMF chunk.
+#define ANIM_CHUNK_SIZE    6     // Size of an ANIM chunk.
+#define FRGM_CHUNK_SIZE    6     // Size of a FRGM chunk.
+#define VP8X_CHUNK_SIZE    10    // Size of a VP8X chunk.
+
+#define MAX_CANVAS_SIZE     (1 << 24)     // 24-bit max for VP8X width/height.
+#define MAX_IMAGE_AREA      (1ULL << 32)  // 32-bit max for width x height.
+#define MAX_LOOP_COUNT      (1 << 16)     // maximum value for loop-count
+#define MAX_DURATION        (1 << 24)     // maximum duration
+#define MAX_POSITION_OFFSET (1 << 24)     // maximum frame/fragment x/y offset
+
+// Maximum chunk payload is such that adding the header and padding won't
+// overflow a uint32_t.
+#define MAX_CHUNK_PAYLOAD (~0U - CHUNK_HEADER_SIZE - 1)
+
+#endif  /* WEBP_WEBP_FORMAT_CONSTANTS_H_ */
diff --git a/src/loaders/webp/webp/meson.build b/src/loaders/webp/webp/meson.build
new file mode 100644
index 00000000..cbdcd4b5
--- /dev/null
+++ b/src/loaders/webp/webp/meson.build
@@ -0,0 +1,10 @@
+source_file = [
+   'decode.h',
+   'format_constants.h',
+   'types.h'
+]
+
+webp_deb += [declare_dependency(
+    include_directories : include_directories('.'),
+    sources : source_file
+)]
\ No newline at end of file
diff --git a/src/loaders/webp/webp/types.h b/src/loaders/webp/webp/types.h
new file mode 100644
index 00000000..008fcbfb
--- /dev/null
+++ b/src/loaders/webp/webp/types.h
@@ -0,0 +1,53 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+//  Common types
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_WEBP_TYPES_H_
+#define WEBP_WEBP_TYPES_H_
+
+#include <stddef.h>  // for size_t
+
+#ifndef _MSC_VER
+#include <inttypes.h>
+#if defined(__cplusplus) || !defined(__STRICT_ANSI__) || \
+    (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#define WEBP_INLINE inline
+#else
+#define WEBP_INLINE
+#endif
+#else
+typedef signed   char int8_t;
+typedef unsigned char uint8_t;
+typedef signed   short int16_t;
+typedef unsigned short uint16_t;
+typedef signed   int int32_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long int uint64_t;
+typedef long long int int64_t;
+#define WEBP_INLINE __forceinline
+#endif  /* _MSC_VER */
+
+#ifndef WEBP_EXTERN
+// This explicitly marks library functions and allows for changing the
+// signature for e.g., Windows DLL builds.
+# if defined(__GNUC__) && __GNUC__ >= 4
+#  define WEBP_EXTERN(type) extern __attribute__ ((visibility ("default"))) type
+# else
+#  define WEBP_EXTERN(type) extern type
+# endif  /* __GNUC__ >= 4 */
+
+#endif  /* WEBP_EXTERN */
+
+// Macro to check ABI compatibility (same major revision number)
+#define WEBP_ABI_IS_INCOMPATIBLE(a, b) (((a) >> 8) != ((b) >> 8))
+
+#endif  /* WEBP_WEBP_TYPES_H_ */