sw_engine: optimize 64-bit rasterization.

Write pixels in 64-bit units: two 32-bit pixels or eight 8-bit pixels per store.
In the best case, this optimization can double the fill performance.
The gain is visually noticeable, especially at large resolutions.

@Issue: https://github.com/thorvg/thorvg/issues/1471
This commit is contained in:
Hermet Park 2023-06-12 15:06:56 +09:00 committed by Hermet Park
parent a597d3bb49
commit 8305383989
2 changed files with 65 additions and 20 deletions

View file

@ -259,7 +259,7 @@ static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t
} }
void _rasterGrayscale8(uint8_t *dst, uint32_t val, uint32_t offset, int32_t len) void _rasterGrayscale8(uint8_t *dst, uint8_t val, uint32_t offset, int32_t len)
{ {
cRasterPixels(dst, val, offset, len); cRasterPixels(dst, val, offset, len);
} }
@ -391,12 +391,11 @@ static bool _rasterSolidRect(SwSurface* surface, const SwBBox& region, uint8_t r
rasterRGBA32(buffer + y * surface->stride, color, region.min.x, w); rasterRGBA32(buffer + y * surface->stride, color, region.min.x, w);
} }
return true; return true;
//8bits grayscale
} }
//8bits grayscale
if (surface->channelSize == sizeof(uint8_t)) { if (surface->channelSize == sizeof(uint8_t)) {
auto buffer = surface->buf8 + (region.min.y * surface->stride);
for (uint32_t y = 0; y < h; ++y) { for (uint32_t y = 0; y < h; ++y) {
_rasterGrayscale8(buffer + y * surface->stride, 255, region.min.x, w); _rasterGrayscale8(surface->buf8, 255, region.min.y * surface->stride + region.min.x, w);
} }
return true; return true;
} }
@ -565,7 +564,7 @@ static bool _rasterSolidRle(SwSurface* surface, const SwRleData* rle, uint8_t r,
//8bit grayscale //8bit grayscale
} else if (surface->channelSize == sizeof(uint8_t)) { } else if (surface->channelSize == sizeof(uint8_t)) {
for (uint32_t i = 0; i < rle->size; ++i, ++span) { for (uint32_t i = 0; i < rle->size; ++i, ++span) {
_rasterGrayscale8(surface->buf8 + span->y * surface->stride, span->coverage, span->x, span->len); _rasterGrayscale8(surface->buf8, span->coverage, span->y * surface->stride + span->x, span->len);
} }
} }
return true; return true;
@ -1637,7 +1636,7 @@ void rasterRGBA32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
#elif defined(THORVG_NEON_VECTOR_SUPPORT) #elif defined(THORVG_NEON_VECTOR_SUPPORT)
neonRasterRGBA32(dst, val, offset, len); neonRasterRGBA32(dst, val, offset, len);
#else #else
cRasterPixels<uint32_t>(dst, val, offset, len); cRasterPixels(dst, val, offset, len);
#endif #endif
} }
@ -1672,24 +1671,22 @@ bool rasterClear(SwSurface* surface, uint32_t x, uint32_t y, uint32_t w, uint32_
if (surface->channelSize == sizeof(uint32_t)) { if (surface->channelSize == sizeof(uint32_t)) {
//full clear //full clear
if (w == surface->stride) { if (w == surface->stride) {
rasterRGBA32(surface->buf32 + (surface->stride * y), 0x00000000, 0, w * h); rasterRGBA32(surface->buf32, 0x00000000, surface->stride * y, w * h);
//partial clear //partial clear
} else { } else {
auto buffer = surface->buf32 + (surface->stride * y + x);
for (uint32_t i = 0; i < h; i++) { for (uint32_t i = 0; i < h; i++) {
rasterRGBA32(buffer + (surface->stride * i), 0x00000000, 0, w); rasterRGBA32(surface->buf32, 0x00000000, (surface->stride * y + x) + (surface->stride * i), w);
} }
} }
//8 bits //8 bits
} else if (surface->channelSize == sizeof(uint8_t)) { } else if (surface->channelSize == sizeof(uint8_t)) {
//full clear //full clear
if (w == surface->stride) { if (w == surface->stride) {
_rasterGrayscale8(surface->buf8 + (surface->stride * y), 0x00, 0, w * h); _rasterGrayscale8(surface->buf8, 0x00, surface->stride * y, w * h);
//partial clear //partial clear
} else { } else {
auto buffer = surface->buf8 + (surface->stride * y + x);
for (uint32_t i = 0; i < h; i++) { for (uint32_t i = 0; i < h; i++) {
_rasterGrayscale8(buffer + (surface->stride * i), 0x00, 0, w); _rasterGrayscale8(surface->buf8, 0x00, (surface->stride * y + x) + (surface->stride * i), w);
} }
} }
} }

View file

@ -21,9 +21,43 @@
*/ */
/*
 * Fills `len` consecutive pixels with `val`, beginning at `dst + offset`.
 *
 * For 4-byte and 1-byte pixel types the bulk of the run is written one 64-bit
 * word at a time (2 or 8 pixels per store); any other pixel size falls through
 * to the plain per-pixel loop.
 *
 * @param dst     Base pointer of the pixel buffer.
 * @param val     Pixel value to store.
 * @param offset  Index of the first pixel to write, relative to dst.
 * @param len     Number of pixels to write; zero or negative writes nothing.
 */
template<typename PIXEL_T>
static void inline cRasterPixels(PIXEL_T* dst, PIXEL_T val, uint32_t offset, int32_t len)
{
    //nothing to fill; this also keeps a negative length from running the tail loop away
    if (len <= 0) return;

    dst += offset;

    //fix the misaligned memory
    //the alignment is taken from the address itself rather than from offset,
    //since callers may pass a base pointer that is not 8-byte aligned (e.g. a row start)
    if (sizeof(PIXEL_T) == 4 || sizeof(PIXEL_T) == 1) {
        while (len > 0 && (reinterpret_cast<uintptr_t>(dst) % sizeof(uint64_t)) != 0) {
            *dst++ = val;
            --len;
        }
    }

    //64bits faster clear
    if (sizeof(PIXEL_T) == 4) {
        //two 32-bit pixels per word
        auto val64 = (uint64_t(val) << 32) | uint64_t(val);
        while (len > 1) {
            *reinterpret_cast<uint64_t*>(dst) = val64;
            len -= 2;
            dst += 2;
        }
    } else if (sizeof(PIXEL_T) == 1) {
        //eight 8-bit pixels per word
        auto val32 = (uint32_t(val) << 24) | (uint32_t(val) << 16) | (uint32_t(val) << 8) | uint32_t(val);
        auto val64 = (uint64_t(val32) << 32) | val32;
        while (len > 7) {
            *reinterpret_cast<uint64_t*>(dst) = val64;
            len -= 8;
            dst += 8;
        }
    }

    //leftovers
    while (len-- > 0) *dst++ = val;
}
@ -97,13 +131,27 @@ static bool inline cRasterABGRtoARGB(Surface* surface)
{ {
TVGLOG("SW_ENGINE", "Convert ColorSpace ABGR - ARGB [Size: %d x %d]", surface->w, surface->h); TVGLOG("SW_ENGINE", "Convert ColorSpace ABGR - ARGB [Size: %d x %d]", surface->w, surface->h);
auto buffer = surface->buf32; //64bits faster converting
for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) { if (surface->w % 2 == 0) {
auto dst = buffer; auto buffer = reinterpret_cast<uint64_t*>(surface->buf32);
for (uint32_t x = 0; x < surface->w; ++x, ++dst) { for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride / 2) {
auto c = *dst; auto dst = buffer;
//flip Blue, Red channels for (uint32_t x = 0; x < surface->w / 2; ++x, ++dst) {
*dst = (c & 0xff000000) + ((c & 0x00ff0000) >> 16) + (c & 0x0000ff00) + ((c & 0x000000ff) << 16); auto c = *dst;
//flip Blue, Red channels
*dst = (c & 0xff000000ff000000) + ((c & 0x00ff000000ff0000) >> 16) + (c & 0x0000ff000000ff00) + ((c & 0x000000ff000000ff) << 16);
}
}
//default converting
} else {
auto buffer = surface->buf32;
for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) {
auto dst = buffer;
for (uint32_t x = 0; x < surface->w; ++x, ++dst) {
auto c = *dst;
//flip Blue, Red channels
*dst = (c & 0xff000000) + ((c & 0x00ff0000) >> 16) + (c & 0x0000ff00) + ((c & 0x000000ff) << 16);
}
} }
} }
return true; return true;