collabora-online/kit/DeltaSimd.c

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
/*
 * Copyright the Collabora Online contributors.
 *
 * SPDX-License-Identifier: MPL-2.0
 */
// This is a C file - to avoid inclusion of C++ headers,
// since compiling with a different instruction set can generate
// versions of inlined code that get injected outside of this
// module by the linker.
#include "config.h"
#include <assert.h>
#include <string.h>
#include <stdio.h>
#include <endian.h>
#include "DeltaSimd.h"
#if ENABLE_SIMD
# include <immintrin.h>
#define DEBUG_LUT 0
// set of control data vectors for vpermd
static __m256i vpermd_lut[256];
static __m256i vpermd_shift_left;
static __m256i vpermd_last_to_first;
static __m256i low_pixel_mask;
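/*
 * How these control vectors are used (summary of the code below):
 *  - vpermd_lut[mask] is a vpermd control vector that packs the "changed"
 *    pixels of an 8-pixel group into the low 32-bit lanes; bit N of mask is
 *    set when pixel N equals the pixel to its left and can be dropped.
 *  - vpermd_shift_left rotates each pixel up one 32-bit lane, so lane N ends
 *    up holding pixel N-1.
 *  - vpermd_last_to_first moves the last pixel of the previous 8-pixel block
 *    into lane 0, and low_pixel_mask selects just that lane 0.
 */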
// Build a table we can look bitmasks up in to generate gather data
void init_gather_lut()
{
    for (unsigned int pattern = 0; pattern < 256; ++pattern)
    {
        unsigned int i = 0, src = 0;
        uint8_t lut[8];
        for (uint32_t bitToCheck = 1; bitToCheck < 256; bitToCheck <<= 1)
        {
            if (!(pattern & bitToCheck)) // set bit is a duplicate -> ignore.
                lut[i++] = src;
            src++;
        }
        while (i < 8) // pad to copy first point
            lut[i++] = 0;
#if DEBUG_LUT
        fprintf(stderr, "lut mask: 0x%x generates %d %d %d %d %d %d %d %d\n",
                pattern, lut[7], lut[6], lut[5], lut[4], lut[3], lut[2], lut[1], lut[0]);
#endif
        vpermd_lut[pattern] = _mm256_set_epi8(
            0, 0, 0, lut[7], 0, 0, 0, lut[6],
            0, 0, 0, lut[5], 0, 0, 0, lut[4],
            0, 0, 0, lut[3], 0, 0, 0, lut[2],
            0, 0, 0, lut[1], 0, 0, 0, lut[0]);
    }
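    /* Illustrative example: pattern 0x05 (pixels 0 and 2 are duplicates)
       keeps sources 1, 3, 4, 5, 6, 7 and pads with 0, 0 - so vpermd with this
       control vector packs the six changed pixels into the low lanes. */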
    vpermd_shift_left = _mm256_set_epi8(
        0, 0, 0, 6, 0, 0, 0, 5,
        0, 0, 0, 4, 0, 0, 0, 3,
        0, 0, 0, 2, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0);
    vpermd_last_to_first = _mm256_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 7);
    low_pixel_mask = _mm256_set_epi8(
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
}
// Counter-intuitively we have to treat the 32-bit comparison results as if
// they were floats: _mm256_movemask_ps gathers the sign bit of each 32-bit
// lane into a compact 8-bit mask, so bit N of the result is set exactly when
// lane N of prev equals lane N of curr.
static uint64_t diffMask(__m256i prev, __m256i curr)
{
    __m256i res = _mm256_cmpeq_epi32(prev, curr);
    __m256 m256 = _mm256_castsi256_ps(res);
    return _mm256_movemask_ps(m256);
}
#endif
void simd_deltaInit(void)
{
#if ENABLE_SIMD
    init_gather_lut();
#endif
}
// accelerated compression of a 256 pixel run
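// In summary (derived from the code below): bit i of the 256-bit RLE mask
// (returned as four little-endian uint64_t words) is set when pixel i equals
// the pixel to its left; the very first pixel is compared against transparent
// (zero). Only pixels whose bit is clear are appended to scratch, and
// *scratchLen receives the number of uint32_t values written. Returns 1 when
// the SIMD path ran, 0 when built without ENABLE_SIMD.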
int simd_initPixRowSimd(const uint32_t *from, uint32_t *scratch, size_t *scratchLen, uint64_t *rleMaskBlockWide)
{
#if !ENABLE_SIMD
    // no fun.
    (void)from; (void)scratch; (void)scratchLen; (void)rleMaskBlockWide;
    return 0;
#else // ENABLE_SIMD
    *scratchLen = 0;

    uint8_t *rleMaskBlock = (uint8_t *)rleMaskBlockWide;
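    // 256 pixels => 32 RLE mask bytes, i.e. the four 64-bit words of rleMaskBlockWide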
    for (unsigned int x = 0; x < 256/8; ++x)
        rleMaskBlock[x] = 0;
    const uint32_t* block = from;
    uint32_t* dest = scratch;

    __m256i prev = _mm256_setzero_si256(); // transparent

    for (unsigned int x = 0; x < 256; x += 8) // 8 pixels per cycle
    {
        __m256i curr = _mm256_loadu_si256((const __m256i_u*)(block + x));

        // Generate mask

        // get the last pixel into the least significant pixel
        // FIXME: mask at the same time ?
        // __m256i lastPix = _mm256_maskz_permutexvar_epi32(0x1, prev, vpermd_last_to_first);
        __m256i lastPix = _mm256_permutevar8x32_epi32(prev, vpermd_last_to_first);
        lastPix = _mm256_and_si256(low_pixel_mask, lastPix);

        // shift the current pixels left
        prev = _mm256_permutevar8x32_epi32(curr, vpermd_shift_left);
        // mask out the bottom pixel
        prev = _mm256_andnot_si256(low_pixel_mask, prev);
        // merge in the last pixel
        prev = _mm256_or_si256(prev, lastPix);
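        // at this point each lane of prev holds the pixel immediately to the
        // left of the matching lane in curr (lane 0 sees the previous block's
        // last pixel, or transparent for the very first block)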
        // turn that into a bit-mask.
        uint64_t newMask = diffMask(prev, curr);
        assert(newMask < 256);

        // invert the bitmask so we can count the changed (non-identical) pixels
        uint32_t newMaskInverse = ~newMask & 0xff;

        // stash our mask for these 8 pixels
        rleMaskBlock[x>>3] = newMask;

        // Shuffle the pixels and pack them
        __m256i control_vector = _mm256_loadu_si256(&vpermd_lut[newMask]);
        __m256i packed = _mm256_permutevar8x32_epi32(curr, control_vector);

        unsigned int countBitsUnset = _mm_popcnt_u32(newMaskInverse);
        assert(countBitsUnset <= 8);

        // over-store in dest: we are guaranteed enough space worst-case
        _mm256_storeu_si256((__m256i*)dest, packed);
#if DEBUG_LUT
        if (countBitsUnset > 0)
            fprintf(stderr, "for mask: 0x%2x bits-unset %d we have:\n"
                    "%4x%4x%4x%4x%4x%4x%4x%4x\n"
                    "%4x%4x%4x%4x%4x%4x%4x%4x\n",
                    (unsigned int)newMask, countBitsUnset,
                    block[x + 0], block[x + 1], block[x + 2], block[x + 3],
                    block[x + 4], block[x + 5], block[x + 6], block[x + 7],
                    dest[0], dest[1], dest[2], dest[3],
                    dest[4], dest[5], dest[6], dest[7]);
#endif
        // move on for the next run.
        dest += countBitsUnset;

        // stash current for use next time around
        prev = curr;
    }
    *scratchLen += dest - scratch;

    // a no-op for LE architectures - ~everyone.
    for (unsigned int x = 0; x < 4; ++x)
        rleMaskBlockWide[x] = htole64(rleMaskBlockWide[x]);

    return 1;
#endif // ENABLE_SIMD
}
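/*
 * For reference, a rough scalar sketch of the transformation the SIMD loop
 * above performs (illustrative only, not used by the build; assumes the
 * rleMaskBlock bytes have been zeroed first, as in the function above):
 *
 *     size_t len = 0;
 *     uint32_t prevPix = 0; // transparent
 *     for (unsigned int i = 0; i < 256; ++i)
 *     {
 *         if (from[i] == prevPix)
 *             rleMaskBlock[i >> 3] |= (uint8_t)(1 << (i & 7)); // run-length bit
 *         else
 *             scratch[len++] = from[i]; // changed pixel -> emit it
 *         prevPix = from[i];
 *     }
 *     *scratchLen = len;
 */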
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */