Skip to content

Instantly share code, notes, and snippets.

@loicmolinari
Created May 22, 2020 13:09
Show Gist options
  • Select an option

  • Save loicmolinari/7846f5197f02c87e764115e34c7f1114 to your computer and use it in GitHub Desktop.

Select an option

Save loicmolinari/7846f5197f02c87e764115e34c7f1114 to your computer and use it in GitHub Desktop.
Branch free 4xU8 elements ring buffer cache
typedef struct {
    U32 elements;
    S32 position;
}  Cache;

void
CacheInit(Cache* cache)
{
    ASSERT(cache);

    cache->elements = 0;
    cache->position = 0;
}

U8
CachePeek(Cache* cache, S32 index)
{
    ASSERT(cache);
    ASSERT(index >= 0 && index <= 3);

    return cache->elements >> (index << 3);
}

void
CachePush(Cache* cache, U8 element)
{
    ASSERT(cache);

    const U32 kOffset = cache->position << 3;
    cache->elements &= ~(0xff << kOffset);
    cache->elements |= element << kOffset;
    cache->position = (cache->position + 1) & 3;
}

// Returns the index of element if cached, 4 otherwise.
S32
CacheLookup(Cache* cache, U8 element)
{
    ASSERT(cache);

    // Use SIMD Within A Register (SWAR) to efficiently find a byte within a 32-bit word. First, set
    // matching bytes to 0 by XORing with value splat over a word. Then, set least significant bits
    // of 0 bytes (http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord). Finally, count
    // trailing bits to get the index. ORing kLsbs with 2^32 allows to avoid the undefined behavior
    // of __builtin_ctzll() with 0 (without branching) and to return 4 when the value is not found.
    const U32 kZeros = cache->elements ^ (element | element << 8 | element << 16 | element << 24);
    const U32 kLsbs = (kZeros - 0x01010101) & ~kZeros & 0x80808080;
    return __builtin_ctzll(kLsbs | 0x100000000) >> 3;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment