Simd: address RLE mask as bytes instead.

Add an endianness swap for the endangered big-endian species.

Benchmarked with https://github.com/CollaboraOnline/benchmark:

    make coolbench && ./coolbench /opt/libreoffice/benchmark/*/*.png

Fastest times:

    before took: 1956ms - time/rle: 39.13us
    after took: 1671ms - time/rle: 33.43us

1.17x faster.

Signed-off-by: Michael Meeks <michael.meeks@collabora.com>
Change-Id: I1873ffd618f20248ade6741cc5ad269b04d3bba2
2024-03-09 17:45:20 +00:00 · 2024-03-09 17:45:20 +00:00 · 7a944ac3cd
parent 6e0cde3596
commit 7a944ac3cd
1 changed file with 14 additions and 9 deletions
--- a/kit/DeltaSimd.c
+++ b/kit/DeltaSimd.c
@ -15,6 +15,8 @@
 #include <assert.h>
 #include <string.h>
 #include <stdio.h>
+#include <endian.h>
+
 #include "DeltaSimd.h"

 #if ENABLE_SIMD
@ -93,7 +95,7 @@ void simd_deltaInit(void)
 }

 // accelerated compression of a 256 pixel run
-int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratchLen, uint64_t *rleMaskBlock)
+int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratchLen, uint64_t *rleMaskBlockWide)
 {
 #if !ENABLE_SIMD
    // no fun.
@ -103,7 +105,8 @@ int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratch
 #else // ENABLE_SIMD

    *scratchLen = 0;
-    for (unsigned int x = 0; x < 4; ++x)
+    uint8_t *rleMaskBlock = (uint8_t *)rleMaskBlockWide;
+    for (unsigned int x = 0; x < 256/8; ++x)
        rleMaskBlock[x] = 0;

    const uint32_t* block = from;
@ -134,12 +137,10 @@ int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratch
        assert (newMask < 256);

        // invert bitmask for counting non-same foo ... [!]
-        uint8_t newMaskInverse = ~newMask;
-        {
-            unsigned int nMask = x >> 6; // 64 bits per mask
-            unsigned int i = (x >> 3) & 0x7; // chunk of bits we work on
-            rleMaskBlock[nMask] |= newMask << (i * 8);
-        }
+        uint32_t newMaskInverse = ~newMask & 0xff;
+
+        // stash our mask for these 8 pixels
+        rleMaskBlock[x>>3] = newMask;

        // Shuffle the pixels and pack them
        __m256i control_vector = _mm256_loadu_si256(&vpermd_lut[newMask]);
@ -171,8 +172,12 @@ int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratch
    }
    *scratchLen += dest - scratch;

+    // a no-op for LE architectures - ~everyone.
+    for (unsigned int x = 0; x < 4; ++x)
+        rleMaskBlockWide[x] = htole64(rleMaskBlockWide[x]);
+
    return 1;
-#endif
+#endif // ENABLE_SIMD
 }

 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */