mirror of
https://github.com/thorvg/thorvg.git
synced 2025-06-14 12:04:29 +00:00
sw_engine: optimize 64-bit rasterization.
Write pixels in 64-bit units (two 32-bit pixels or eight 8-bit pixels per store) instead of one pixel at a time. In the best case this optimization can double the rasterization performance; the gain is visually noticeable, especially when the resolution is sufficiently large. @Issue: https://github.com/thorvg/thorvg/issues/1471
This commit is contained in:
parent
a597d3bb49
commit
8305383989
2 changed files with 65 additions and 20 deletions
|
@ -259,7 +259,7 @@ static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t
|
|||
}
|
||||
|
||||
|
||||
void _rasterGrayscale8(uint8_t *dst, uint32_t val, uint32_t offset, int32_t len)
|
||||
//Thin wrapper for 8-bit buffers: forwards dst, val, offset and len unchanged to cRasterPixels().
void _rasterGrayscale8(uint8_t *dst, uint8_t val, uint32_t offset, int32_t len)
|
||||
{
|
||||
cRasterPixels(dst, val, offset, len);
|
||||
}
|
||||
|
@ -391,12 +391,11 @@ static bool _rasterSolidRect(SwSurface* surface, const SwBBox& region, uint8_t r
|
|||
rasterRGBA32(buffer + y * surface->stride, color, region.min.x, w);
|
||||
}
|
||||
return true;
|
||||
//8bits grayscale
|
||||
}
|
||||
//8bits grayscale
|
||||
if (surface->channelSize == sizeof(uint8_t)) {
|
||||
auto buffer = surface->buf8 + (region.min.y * surface->stride);
|
||||
for (uint32_t y = 0; y < h; ++y) {
|
||||
_rasterGrayscale8(buffer + y * surface->stride, 255, region.min.x, w);
|
||||
_rasterGrayscale8(surface->buf8, 255, region.min.y * surface->stride + region.min.x, w);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
@ -565,7 +564,7 @@ static bool _rasterSolidRle(SwSurface* surface, const SwRleData* rle, uint8_t r,
|
|||
//8bit grayscale
|
||||
} else if (surface->channelSize == sizeof(uint8_t)) {
|
||||
for (uint32_t i = 0; i < rle->size; ++i, ++span) {
|
||||
_rasterGrayscale8(surface->buf8 + span->y * surface->stride, span->coverage, span->x, span->len);
|
||||
_rasterGrayscale8(surface->buf8, span->coverage, span->y * surface->stride + span->x, span->len);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
@ -1637,7 +1636,7 @@ void rasterRGBA32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
|
|||
#elif defined(THORVG_NEON_VECTOR_SUPPORT)
|
||||
neonRasterRGBA32(dst, val, offset, len);
|
||||
#else
|
||||
cRasterPixels<uint32_t>(dst, val, offset, len);
|
||||
cRasterPixels(dst, val, offset, len);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -1672,24 +1671,22 @@ bool rasterClear(SwSurface* surface, uint32_t x, uint32_t y, uint32_t w, uint32_
|
|||
if (surface->channelSize == sizeof(uint32_t)) {
|
||||
//full clear
|
||||
if (w == surface->stride) {
|
||||
rasterRGBA32(surface->buf32 + (surface->stride * y), 0x00000000, 0, w * h);
|
||||
rasterRGBA32(surface->buf32, 0x00000000, surface->stride * y, w * h);
|
||||
//partial clear
|
||||
} else {
|
||||
auto buffer = surface->buf32 + (surface->stride * y + x);
|
||||
for (uint32_t i = 0; i < h; i++) {
|
||||
rasterRGBA32(buffer + (surface->stride * i), 0x00000000, 0, w);
|
||||
rasterRGBA32(surface->buf32, 0x00000000, (surface->stride * y + x) + (surface->stride * i), w);
|
||||
}
|
||||
}
|
||||
//8 bits
|
||||
} else if (surface->channelSize == sizeof(uint8_t)) {
|
||||
//full clear
|
||||
if (w == surface->stride) {
|
||||
_rasterGrayscale8(surface->buf8 + (surface->stride * y), 0x00, 0, w * h);
|
||||
_rasterGrayscale8(surface->buf8, 0x00, surface->stride * y, w * h);
|
||||
//partial clear
|
||||
} else {
|
||||
auto buffer = surface->buf8 + (surface->stride * y + x);
|
||||
for (uint32_t i = 0; i < h; i++) {
|
||||
_rasterGrayscale8(buffer + (surface->stride * i), 0x00, 0, w);
|
||||
_rasterGrayscale8(surface->buf8, 0x00, (surface->stride * y + x) + (surface->stride * i), w);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,9 +21,43 @@
|
|||
*/
|
||||
|
||||
//Fill `len` consecutive pixels with `val`, starting at `dst + offset`.
//
// dst    : base pointer of the pixel buffer
// val    : pixel value to store
// offset : start position, counted in pixels from `dst`
// len    : number of pixels to write; zero or negative values write nothing
//
//4-byte and 1-byte pixel types are written through 64-bit stores (2 or 8 pixels at a time);
//any other pixel size falls back to the plain per-pixel loop.
template<typename PIXEL_T>
static void inline cRasterPixels(PIXEL_T* dst, PIXEL_T val, uint32_t offset, int32_t len)
{
    //len is signed: without this guard a negative value would run the leftover loop away.
    if (len <= 0) return;

    dst += offset;

    //fix the misaligned memory
    //The boundary is taken from the address itself rather than from `offset`,
    //because the base pointer handed in by a caller is not guaranteed to sit on an 8-byte boundary.
    while (len > 0 && (reinterpret_cast<uintptr_t>(dst) % sizeof(uint64_t)) != 0) {
        *dst++ = val;
        --len;
    }

    //64bits faster clear
    if (sizeof(PIXEL_T) == 4) {
        //two 32-bit pixels per store
        auto val64 = (uint64_t(val) << 32) | uint64_t(val);
        while (len > 1) {
            *reinterpret_cast<uint64_t*>(dst) = val64;
            len -= 2;
            dst += 2;
        }
    } else if (sizeof(PIXEL_T) == 1) {
        //eight 8-bit pixels per store
        auto val32 = (uint32_t(val) << 24) | (uint32_t(val) << 16) | (uint32_t(val) << 8) | uint32_t(val);
        auto val64 = (uint64_t(val32) << 32) | val32;
        while (len > 7) {
            *reinterpret_cast<uint64_t*>(dst) = val64;
            len -= 8;
            dst += 8;
        }
    }

    //leftovers
    while (len-- > 0) *dst++ = val;
}
|
||||
|
||||
|
@ -97,13 +131,27 @@ static bool inline cRasterABGRtoARGB(Surface* surface)
|
|||
{
|
||||
TVGLOG("SW_ENGINE", "Convert ColorSpace ABGR - ARGB [Size: %d x %d]", surface->w, surface->h);
|
||||
|
||||
auto buffer = surface->buf32;
|
||||
for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) {
|
||||
auto dst = buffer;
|
||||
for (uint32_t x = 0; x < surface->w; ++x, ++dst) {
|
||||
auto c = *dst;
|
||||
//flip Blue, Red channels
|
||||
*dst = (c & 0xff000000) + ((c & 0x00ff0000) >> 16) + (c & 0x0000ff00) + ((c & 0x000000ff) << 16);
|
||||
//64bits faster converting
|
||||
if (surface->w % 2 == 0) {
|
||||
auto buffer = reinterpret_cast<uint64_t*>(surface->buf32);
|
||||
for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride / 2) {
|
||||
auto dst = buffer;
|
||||
for (uint32_t x = 0; x < surface->w / 2; ++x, ++dst) {
|
||||
auto c = *dst;
|
||||
//flip Blue, Red channels
|
||||
*dst = (c & 0xff000000ff000000) + ((c & 0x00ff000000ff0000) >> 16) + (c & 0x0000ff000000ff00) + ((c & 0x000000ff000000ff) << 16);
|
||||
}
|
||||
}
|
||||
//default converting
|
||||
} else {
|
||||
auto buffer = surface->buf32;
|
||||
for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) {
|
||||
auto dst = buffer;
|
||||
for (uint32_t x = 0; x < surface->w; ++x, ++dst) {
|
||||
auto c = *dst;
|
||||
//flip Blue, Red channels
|
||||
*dst = (c & 0xff000000) + ((c & 0x00ff0000) >> 16) + (c & 0x0000ff00) + ((c & 0x000000ff) << 16);
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
|
Loading…
Add table
Reference in a new issue