Simd: address RLE mask as bytes instead.

Add an endianness swap for the endangered big-endian species.

With: https://github.com/CollaboraOnline/benchmark

make coolbench && ./coolbench /opt/libreoffice/benchmark/*/*.png

fastest times:

before took: 1956ms - time/rle: 39.13us
after  took: 1671ms - time/rle: 33.43us

1.17x faster.

Signed-off-by: Michael Meeks <michael.meeks@collabora.com>
Change-Id: I1873ffd618f20248ade6741cc5ad269b04d3bba2
private/marco/calc-tabs-position
Michael Meeks 2024-03-09 17:45:20 +00:00 committed by Caolán McNamara
parent 6e0cde3596
commit 7a944ac3cd
1 changed files with 14 additions and 9 deletions

View File

@ -15,6 +15,8 @@
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <endian.h>
#include "DeltaSimd.h"
#if ENABLE_SIMD
@ -93,7 +95,7 @@ void simd_deltaInit(void)
}
// accelerated compression of a 256 pixel run
int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratchLen, uint64_t *rleMaskBlock)
int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratchLen, uint64_t *rleMaskBlockWide)
{
#if !ENABLE_SIMD
// no fun.
@ -103,7 +105,8 @@ int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratch
#else // ENABLE_SIMD
*scratchLen = 0;
for (unsigned int x = 0; x < 4; ++x)
uint8_t *rleMaskBlock = (uint8_t *)rleMaskBlockWide;
for (unsigned int x = 0; x < 256/8; ++x)
rleMaskBlock[x] = 0;
const uint32_t* block = from;
@ -134,12 +137,10 @@ int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratch
assert (newMask < 256);
// invert bitmask for counting non-same foo ... [!]
uint8_t newMaskInverse = ~newMask;
{
unsigned int nMask = x >> 6; // 64 bits per mask
unsigned int i = (x >> 3) & 0x7; // chunk of bits we work on
rleMaskBlock[nMask] |= newMask << (i * 8);
}
uint32_t newMaskInverse = ~newMask & 0xff;
// stash our mask for these 8 pixels
rleMaskBlock[x>>3] = newMask;
// Shuffle the pixels and pack them
__m256i control_vector = _mm256_loadu_si256(&vpermd_lut[newMask]);
@ -171,8 +172,12 @@ int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratch
}
*scratchLen += dest - scratch;
// a no-op for LE architectures - ~everyone.
for (unsigned int x = 0; x < 4; ++x)
rleMaskBlockWide[x] = htole64(rleMaskBlockWide[x]);
return 1;
#endif
#endif // ENABLE_SIMD
}
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */