collabora-online/kit/DeltaSimd.c

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
/*
 * Copyright the Collabora Online contributors.
 *
 * SPDX-License-Identifier: MPL-2.0
 */
// This is a C file - to avoid inclusion of C++ headers,
// since compiling with a different instruction set can generate
// versions of inlined code that get injected outside of this
// module by the linker.
#include "config.h"
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <endian.h>
#include "DeltaSimd.h"
#if ENABLE_SIMD
# include <immintrin.h>
#define DEBUG_LUT 0
// set of control data vectors for vpermd
static __m256i vpermd_lut[256];
static __m256i vpermd_shift_left;
static __m256i vpermd_last_to_first;
static __m256i low_pixel_mask;
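/*
 * How these control vectors are used (summary of the code below):
 *  - vpermd_lut[mask] is a vpermd control vector that packs the "changed"
 *    pixels of an 8-pixel group into the low 32-bit lanes; bit N of mask is
 *    set when pixel N equals the pixel to its left and can be dropped.
 *  - vpermd_shift_left rotates each pixel up one 32-bit lane, so lane N ends
 *    up holding pixel N-1.
 *  - vpermd_last_to_first moves the last pixel of the previous 8-pixel block
 *    into lane 0, and low_pixel_mask selects just that lane 0.
 */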
// Build a table we can look bitmasks up in to generate gather data
void init_gather_lut()
{
    for (unsigned int pattern = 0; pattern < 256; ++pattern)
    {
        unsigned int i = 0, src = 0;
        uint8_t lut[8];
        for (uint32_t bitToCheck = 1; bitToCheck < 256; bitToCheck <<= 1)
        {
            if (!(pattern & bitToCheck)) // set bit is a duplicate -> ignore.
                lut[i++] = src;
            src++;
        }
        while (i < 8) // pad to copy first point
            lut[i++] = 0;
#if DEBUG_LUT
        fprintf(stderr, "lut mask: 0x%x generates %d %d %d %d %d %d %d %d\n",
                pattern, lut[7], lut[6], lut[5], lut[4], lut[3], lut[2], lut[1], lut[0]);
#endif
        vpermd_lut[pattern] = _mm256_set_epi8(
            0, 0, 0, lut[7], 0, 0, 0, lut[6],
            0, 0, 0, lut[5], 0, 0, 0, lut[4],
            0, 0, 0, lut[3], 0, 0, 0, lut[2],
            0, 0, 0, lut[1], 0, 0, 0, lut[0]);
    }
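    /* Illustrative example: pattern 0x05 (pixels 0 and 2 are duplicates)
       keeps sources 1, 3, 4, 5, 6, 7 and pads with 0, 0 - so vpermd with this
       control vector packs the six changed pixels into the low lanes. */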
    vpermd_shift_left = _mm256_set_epi8(
        0, 0, 0, 6, 0, 0, 0, 5,
        0, 0, 0, 4, 0, 0, 0, 3,
        0, 0, 0, 2, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0);
    vpermd_last_to_first = _mm256_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 7);
    low_pixel_mask = _mm256_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
}
// Counter-intuitively we have to treat the 32-bit comparison results as if
// they were floats: _mm256_movemask_ps gathers the sign bit of each 32-bit
// lane into a compact 8-bit mask, so bit N of the result is set exactly when
// lane N of prev equals lane N of curr.
static uint64_t diffMask(__m256i prev, __m256i curr)
{
    __m256i res = _mm256_cmpeq_epi32(prev, curr);
    __m256 m256 = _mm256_castsi256_ps(res);
    return _mm256_movemask_ps(m256);
}
#endif
void simd_deltaInit(void)
{
#if ENABLE_SIMD
    init_gather_lut();
#endif
}
// accelerated compression of a 256 pixel run
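// In summary (derived from the code below): bit i of the 256-bit RLE mask
// (returned as four little-endian uint64_t words) is set when pixel i equals
// the pixel to its left; the very first pixel is compared against transparent
// (zero). Only pixels whose bit is clear are appended to scratch, and
// *scratchLen receives the number of uint32_t values written. Returns 1 when
// the SIMD path ran, 0 when built without ENABLE_SIMD.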
int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratchLen, uint64_t *rleMaskBlockWide)
{
#if !ENABLE_SIMD
    // no fun.
    (void)from; (void)scratch; (void)scratchLen; (void)rleMaskBlockWide;
    return 0;
#else // ENABLE_SIMD
    *scratchLen = 0;

    uint8_t *rleMaskBlock = (uint8_t *)rleMaskBlockWide;
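    // 256 pixels => 32 RLE mask bytes, i.e. the four 64-bit words of rleMaskBlockWide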
    for (unsigned int x = 0; x < 256/8; ++x)
        rleMaskBlock[x] = 0;
    const uint32_t* block = from;
    uint32_t* dest = scratch;

    __m256i prev = _mm256_setzero_si256(); // transparent

    for (unsigned int x = 0; x < 256; x += 8) // 8 pixels per cycle
    {
        __m256i curr = _mm256_loadu_si256((const __m256i_u*)(block + x));

        // Generate mask

        // get the last pixel into the least significant pixel
        // FIXME: mask at the same time ?
        // __m256i lastPix = _mm256_maskz_permutexvar_epi32(0x1, prev, vpermd_last_to_first);
        __m256i lastPix = _mm256_permutevar8x32_epi32(prev, vpermd_last_to_first);
        lastPix = _mm256_and_si256(low_pixel_mask, lastPix);

        // shift the current pixels left
        prev = _mm256_permutevar8x32_epi32(curr, vpermd_shift_left);
        // mask out the bottom pixel
        prev = _mm256_andnot_si256(low_pixel_mask, prev);
        // merge in the last pixel
        prev = _mm256_or_si256(prev, lastPix);
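        // at this point each lane of prev holds the pixel immediately to the
        // left of the matching lane in curr (lane 0 sees the previous block's
        // last pixel, or transparent for the very first block)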
        // turn that into a bit-mask.
        uint64_t newMask = diffMask(prev, curr);
        assert(newMask < 256);

        // invert the bitmask so we can count the changed (non-identical) pixels
        uint32_t newMaskInverse = ~newMask & 0xff;

        // stash our mask for these 8 pixels
        rleMaskBlock[x>>3] = newMask;

        // Shuffle the pixels and pack them
        __m256i control_vector = _mm256_loadu_si256(&vpermd_lut[newMask]);
        __m256i packed = _mm256_permutevar8x32_epi32(curr, control_vector);

        unsigned int countBitsUnset = _mm_popcnt_u32(newMaskInverse);
        assert(countBitsUnset <= 8);

        // over-store in dest: we are guaranteed enough space worst-case
        _mm256_storeu_si256((__m256i*)dest, packed);
#if DEBUG_LUT
        if (countBitsUnset > 0)
            fprintf(stderr, "for mask: 0x%2x bits-unset %d we have:\n"
                    "%4x%4x%4x%4x%4x%4x%4x%4x\n"
                    "%4x%4x%4x%4x%4x%4x%4x%4x\n",
                    (unsigned int)newMask, countBitsUnset,
                    block[x + 0], block[x + 1], block[x + 2], block[x + 3],
                    block[x + 4], block[x + 5], block[x + 6], block[x + 7],
                    dest[0], dest[1], dest[2], dest[3],
                    dest[4], dest[5], dest[6], dest[7]);
#endif
        // move on for the next run.
        dest += countBitsUnset;

        // stash current for use next time around
        prev = curr;
    }
    *scratchLen += dest - scratch;

    // a no-op for LE architectures - ~everyone.
    for (unsigned int x = 0; x < 4; ++x)
        rleMaskBlockWide[x] = htole64(rleMaskBlockWide[x]);

    return 1;
#endif // ENABLE_SIMD
}
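/*
 * For reference, a rough scalar sketch of the transformation the SIMD loop
 * above performs (illustrative only, not used by the build; assumes the
 * rleMaskBlock bytes have been zeroed first, as in the function above):
 *
 *     size_t len = 0;
 *     uint32_t prevPix = 0; // transparent
 *     for (unsigned int i = 0; i < 256; ++i)
 *     {
 *         if (from[i] == prevPix)
 *             rleMaskBlock[i >> 3] |= (uint8_t)(1 << (i & 7)); // run-length bit
 *         else
 *             scratch[len++] = from[i]; // changed pixel -> emit it
 *         prevPix = from[i];
 *     }
 *     *scratchLen = len;
 */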
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */