From 8305383989a7b401bf31f80072f237d243d54a56 Mon Sep 17 00:00:00 2001 From: Hermet Park Date: Mon, 12 Jun 2023 15:06:56 +0900 Subject: [PATCH] sw_engine: optimize 64-bit rasterization. Write the pixels per 64-bit data. This optimization can potentially double the performance in the best case scenario. It's visually noticeable, especially when the resolution is sufficiently large. @Issue: https://github.com/thorvg/thorvg/issues/1471 --- src/lib/sw_engine/tvgSwRaster.cpp | 21 +++++----- src/lib/sw_engine/tvgSwRasterC.h | 64 +++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 20 deletions(-) diff --git a/src/lib/sw_engine/tvgSwRaster.cpp b/src/lib/sw_engine/tvgSwRaster.cpp index e43bbf7d..b8dc3246 100644 --- a/src/lib/sw_engine/tvgSwRaster.cpp +++ b/src/lib/sw_engine/tvgSwRaster.cpp @@ -259,7 +259,7 @@ static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t } -void _rasterGrayscale8(uint8_t *dst, uint32_t val, uint32_t offset, int32_t len) +void _rasterGrayscale8(uint8_t *dst, uint8_t val, uint32_t offset, int32_t len) { cRasterPixels(dst, val, offset, len); } @@ -391,12 +391,11 @@ static bool _rasterSolidRect(SwSurface* surface, const SwBBox& region, uint8_t r rasterRGBA32(buffer + y * surface->stride, color, region.min.x, w); } return true; - //8bits grayscale } + //8bits grayscale if (surface->channelSize == sizeof(uint8_t)) { - auto buffer = surface->buf8 + (region.min.y * surface->stride); for (uint32_t y = 0; y < h; ++y) { - _rasterGrayscale8(buffer + y * surface->stride, 255, region.min.x, w); + _rasterGrayscale8(surface->buf8, 255, (region.min.y + y) * surface->stride + region.min.x, w); } return true; } @@ -565,7 +564,7 @@ static bool _rasterSolidRle(SwSurface* surface, const SwRleData* rle, uint8_t r, //8bit grayscale } else if (surface->channelSize == sizeof(uint8_t)) { for (uint32_t i = 0; i < rle->size; ++i, ++span) { - _rasterGrayscale8(surface->buf8 + span->y * surface->stride, span->coverage, span->x, span->len); +
_rasterGrayscale8(surface->buf8, span->coverage, span->y * surface->stride + span->x, span->len); } } return true; @@ -1637,7 +1636,7 @@ void rasterRGBA32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len) #elif defined(THORVG_NEON_VECTOR_SUPPORT) neonRasterRGBA32(dst, val, offset, len); #else - cRasterPixels(dst, val, offset, len); + cRasterPixels(dst, val, offset, len); #endif } @@ -1672,24 +1671,22 @@ bool rasterClear(SwSurface* surface, uint32_t x, uint32_t y, uint32_t w, uint32_ if (surface->channelSize == sizeof(uint32_t)) { //full clear if (w == surface->stride) { - rasterRGBA32(surface->buf32 + (surface->stride * y), 0x00000000, 0, w * h); + rasterRGBA32(surface->buf32, 0x00000000, surface->stride * y, w * h); //partial clear } else { - auto buffer = surface->buf32 + (surface->stride * y + x); for (uint32_t i = 0; i < h; i++) { - rasterRGBA32(buffer + (surface->stride * i), 0x00000000, 0, w); + rasterRGBA32(surface->buf32, 0x00000000, (surface->stride * y + x) + (surface->stride * i), w); } } //8 bits } else if (surface->channelSize == sizeof(uint8_t)) { //full clear if (w == surface->stride) { - _rasterGrayscale8(surface->buf8 + (surface->stride * y), 0x00, 0, w * h); + _rasterGrayscale8(surface->buf8, 0x00, surface->stride * y, w * h); //partial clear } else { - auto buffer = surface->buf8 + (surface->stride * y + x); for (uint32_t i = 0; i < h; i++) { - _rasterGrayscale8(buffer + (surface->stride * i), 0x00, 0, w); + _rasterGrayscale8(surface->buf8, 0x00, (surface->stride * y + x) + (surface->stride * i), w); } } } diff --git a/src/lib/sw_engine/tvgSwRasterC.h b/src/lib/sw_engine/tvgSwRasterC.h index a040269f..a8c44d42 100644 --- a/src/lib/sw_engine/tvgSwRasterC.h +++ b/src/lib/sw_engine/tvgSwRasterC.h @@ -21,9 +21,43 @@ */ template<typename PIXEL_T> -static void inline cRasterPixels(PIXEL_T* dst, uint32_t val, uint32_t offset, int32_t len) +static void inline cRasterPixels(PIXEL_T* dst, PIXEL_T val, uint32_t offset, int32_t len) { dst += offset; + + //fix the
misaligned memory (derived from the actual address of dst, not from offset) + auto alignOffset = 0; + if (sizeof(PIXEL_T) == 4) alignOffset = static_cast<int>((reinterpret_cast<uintptr_t>(dst) % 8) / 4); + else if (sizeof(PIXEL_T) == 1) { + alignOffset = static_cast<int>(reinterpret_cast<uintptr_t>(dst) % 8); + if (alignOffset > 0) alignOffset = 8 - alignOffset; + } + + while (alignOffset > 0 && len > 0) { + *dst++ = val; + --len; + --alignOffset; + } + + //64bits faster clear + if ((sizeof(PIXEL_T) == 4)) { + auto val64 = (uint64_t(val) << 32) | uint64_t(val); + while (len > 1) { + *reinterpret_cast<uint64_t*>(dst) = val64; + len -= 2; + dst += 2; + } + } else if (sizeof(PIXEL_T) == 1) { + auto val32 = (uint32_t(val) << 24) | (uint32_t(val) << 16) | (uint32_t(val) << 8) | uint32_t(val); + auto val64 = (uint64_t(val32) << 32) | val32; + while (len > 7) { + *reinterpret_cast<uint64_t*>(dst) = val64; + len -= 8; + dst += 8; + } + } + + //leftovers while (len--) *dst++ = val; } @@ -97,13 +131,27 @@ static bool inline cRasterABGRtoARGB(Surface* surface) { TVGLOG("SW_ENGINE", "Convert ColorSpace ABGR - ARGB [Size: %d x %d]", surface->w, surface->h); - auto buffer = surface->buf32; - for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) { - auto dst = buffer; - for (uint32_t x = 0; x < surface->w; ++x, ++dst) { - auto c = *dst; - //flip Blue, Red channels - *dst = (c & 0xff000000) + ((c & 0x00ff0000) >> 16) + (c & 0x0000ff00) + ((c & 0x000000ff) << 16); + //64bits faster converting (every row must hold whole 64-bit pixel pairs) + if (surface->w % 2 == 0 && surface->stride % 2 == 0) { + auto buffer = reinterpret_cast<uint64_t*>(surface->buf32); + for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride / 2) { + auto dst = buffer; + for (uint32_t x = 0; x < surface->w / 2; ++x, ++dst) { + auto c = *dst; + //flip Blue, Red channels + *dst = (c & 0xff000000ff000000) + ((c & 0x00ff000000ff0000) >> 16) + (c & 0x0000ff000000ff00) + ((c & 0x000000ff000000ff) << 16); + } + } + //default converting + } else { + auto buffer = surface->buf32; + for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) { + auto dst = buffer; + for (uint32_t x = 0; x < surface->w; ++x, ++dst) { + auto c = *dst; + //flip
Blue, Red channels + *dst = (c & 0xff000000) + ((c & 0x00ff0000) >> 16) + (c & 0x0000ff00) + ((c & 0x000000ff) << 16); + } } } return true;