diff --git a/src/lib/sw_engine/tvgSwCommon.h b/src/lib/sw_engine/tvgSwCommon.h
index a0d18ddd..6ad2df41 100644
--- a/src/lib/sw_engine/tvgSwCommon.h
+++ b/src/lib/sw_engine/tvgSwCommon.h
@@ -275,6 +275,16 @@ static inline uint32_t ALPHA_BLEND(uint32_t c, uint32_t a)
             ((((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff));
 }
 
+#if defined(THORVG_NEON_VECTOR_SUPPORT)
+//Scales each of the eight 8-bit lanes of c by the matching lane of a.
+static inline uint8x8_t ALPHA_BLEND_NEON(uint8x8_t c, uint8x8_t a)
+{
+    //Add 0xff before the shift, as the scalar ALPHA_BLEND does, so both paths round alike (255 * 255 + 255 still fits in 16 bits).
+    uint16x8_t t = vaddq_u16(vmull_u8(c, a), vdupq_n_u16(0xff));
+    return vshrn_n_u16(t, 8);
+}
+#endif
+
 static inline uint32_t COLOR_INTERPOLATE(uint32_t c1, uint32_t a1, uint32_t c2, uint32_t a2)
 {
     auto t = (((c1 & 0xff00ff) * a1 + (c2 & 0xff00ff) * a2) >> 8) & 0xff00ff;
diff --git a/src/lib/sw_engine/tvgSwRaster.cpp b/src/lib/sw_engine/tvgSwRaster.cpp
index e4e86b5e..5fcc81ab 100644
--- a/src/lib/sw_engine/tvgSwRaster.cpp
+++ b/src/lib/sw_engine/tvgSwRaster.cpp
@@ -215,12 +215,34 @@ static bool _translucentRle(SwSurface* surface, const SwRleData* rle, uint32_t c
 
     for (uint32_t i = 0; i < rle->size; ++i) {
         auto dst = &surface->buffer[span->y * surface->stride + span->x];
         if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage);
         else src = color;
         auto ialpha = 255 - surface->blender.alpha(src);
+
+#if defined(THORVG_NEON_VECTOR_SUPPORT)
+        //Two pixels (eight 8-bit channels) are blended per vector operation.
+        uint8x8_t vSrc = vreinterpret_u8_u32(vdup_n_u32(src));
+        //The inverse alpha has to fill every channel lane; duplicating it as a 32-bit value would zero three lanes of each pixel.
+        uint8x8_t vIalpha = vdup_n_u8((uint8_t) ialpha);
+
+        uint32_t pairs = span->len / 2;
+
+        for (uint32_t x = 0; x < pairs; ++x) {
+            //vld1_u8/vst1_u8 avoid dereferencing a uint8x8_t* into dst, which may not be 8-byte aligned.
+            auto p = reinterpret_cast<uint8_t*>(dst + 2 * x);
+            vst1_u8(p, vadd_u8(vSrc, ALPHA_BLEND_NEON(vld1_u8(p), vIalpha)));
+        }
+
+        //An odd-length span leaves its last pixel (index len - 1) to the scalar blend.
+        if (span->len % 2) {
+            dst[span->len - 1] = src + ALPHA_BLEND(dst[span->len - 1], ialpha);
+        }
+#else
+
         for (uint32_t x = 0; x < span->len; ++x) {
             dst[x] = src + ALPHA_BLEND(dst[x], ialpha);
         }
+#endif
         ++span;
     }
     return true;