sw_engine common: Added neon alpha blending.

Changes:
- Prepare a NEON version of the ALPHA_BLEND API.
- Use ALPHA_BLEND_NEON in _translucentRle

Notes:
- With NEON support, _translucentRle runs roughly 3x faster
  (~300 % speed-up, measured on a 400 x 400 uint32_t buffer).
- The API was tested on an ARMv7l device with a GCC 9.2-based toolchain.
  Results on other devices may differ.
This commit is contained in:
Michal Szczecinski 2021-08-03 15:33:29 +02:00 committed by Hermet Park
parent a042e771e4
commit 3ce0722ffd
2 changed files with 31 additions and 0 deletions

View file

@ -275,6 +275,14 @@ static inline uint32_t ALPHA_BLEND(uint32_t c, uint32_t a)
((((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff)); ((((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff));
} }
#if defined(THORVG_NEON_VECTOR_SUPPORT)
/*
 * NEON counterpart of the scalar ALPHA_BLEND(): scales the eight 8-bit lanes
 * of c by the matching 8-bit factors in a.
 *
 * Each lane is computed as (c * a + 0xff) >> 8. The +0xff term is the same
 * rounding bias the scalar ALPHA_BLEND() adds before its shift, so both code
 * paths agree and a factor of 255 leaves a lane unchanged (without the bias,
 * (255 * 255) >> 8 would yield 254).
 * The widened sum cannot overflow: 255 * 255 + 255 = 65280 fits in 16 bits.
 *
 * c: eight 8-bit channel values
 * a: eight 8-bit scale factors, one per lane of c
 * returns: eight scaled 8-bit channel values
 */
static inline uint8x8_t ALPHA_BLEND_NEON(uint8x8_t c, uint8x8_t a)
{
    //widening multiply-accumulate: biased = 0xff + c * a, per 16-bit lane
    uint16x8_t biased = vmlal_u8(vdupq_n_u16(0xff), c, a);
    //narrow back to 8 bits, keeping the high byte of every lane (divide by 256)
    return vshrn_n_u16(biased, 8);
}
#endif
static inline uint32_t COLOR_INTERPOLATE(uint32_t c1, uint32_t a1, uint32_t c2, uint32_t a2) static inline uint32_t COLOR_INTERPOLATE(uint32_t c1, uint32_t a1, uint32_t c2, uint32_t a2)
{ {
auto t = (((c1 & 0xff00ff) * a1 + (c2 & 0xff00ff) * a2) >> 8) & 0xff00ff; auto t = (((c1 & 0xff00ff) * a1 + (c2 & 0xff00ff) * a2) >> 8) & 0xff00ff;

View file

@ -215,12 +215,35 @@ static bool _translucentRle(SwSurface* surface, const SwRleData* rle, uint32_t c
for (uint32_t i = 0; i < rle->size; ++i) { for (uint32_t i = 0; i < rle->size; ++i) {
auto dst = &surface->buffer[span->y * surface->stride + span->x]; auto dst = &surface->buffer[span->y * surface->stride + span->x];
#if defined(THORVG_NEON_VECTOR_SUPPORT)
uint8x8_t *vDst = (uint8x8_t*) dst;
#endif
if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage); if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage);
else src = color; else src = color;
auto ialpha = 255 - surface->blender.alpha(src); auto ialpha = 255 - surface->blender.alpha(src);
#if defined(THORVG_NEON_VECTOR_SUPPORT)
uint8x8_t vSrc = (uint8x8_t) vdup_n_u32(src);
uint8x8_t vIalpha = (uint8x8_t) vdup_n_u32(ialpha);
uint32_t iterations = span->len / 2;
uint32_t left = span->len % 2;
for (uint32_t x = 0; x < iterations; x+=2) {
vDst[x] = vadd_u8(vSrc, ALPHA_BLEND_NEON(vDst[x], vIalpha));
}
if (left) {
dst[span->len] = src + ALPHA_BLEND(dst[span->len], ialpha);
}
#else
for (uint32_t x = 0; x < span->len; ++x) { for (uint32_t x = 0; x < span->len; ++x) {
dst[x] = src + ALPHA_BLEND(dst[x], ialpha); dst[x] = src + ALPHA_BLEND(dst[x], ialpha);
} }
#endif
++span; ++span;
} }
return true; return true;