@Rexicon226
Created November 28, 2025 04:54
RVV indexOfSentinel benchmark
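A micro-benchmark comparing a hand-written RISC-V Vector (RVV) indexOfSentinel against std.mem.indexOfSentinel. For each power-of-two buffer size the sentinel is placed in the final element (the worst case, forcing a full scan), and the average rdtime ticks per call of each implementation are printed as CSV.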
const std = @import("std");
const iterations_per_byte = 1000;
const warmup_iterations = 10;
pub fn main() !void {
    const allocator = std.heap.smp_allocator;

    // Pin the process to a single core (core 0) to keep timings stable
    const cpu0001: std.os.linux.cpu_set_t = [1]usize{0b0001} ++ ([_]usize{0} ** (16 - 1));
    try std.os.linux.sched_setaffinity(0, &cpu0001);

    var stdout = std.fs.File.stdout().writer(&.{});
    const writer = &stdout.interface;

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);

    // First argument: log2 of the buffer size to allocate
    const max_bytes = try std.fmt.parseInt(usize, args[1], 10);
    const pow_max_bytes = try std.math.powi(usize, 2, max_bytes);
    const buffer = try allocator.alignedAlloc(u8, .of(u64), pow_max_bytes);
    for (3..max_bytes) |N| {
        const index = try std.math.powi(usize, 2, N);
        const slice = buffer[0..index];

        // worst case scenario: the sentinel is at the last byte, we need to scan over everything
        @memset(slice, 0xAA);
        slice[index - 8 ..][0..8].* = @splat(0); // for all our versions u8...u64 we have a sentinel

        try writer.print("{},", .{index});

        inline for (.{
            .{ indexOfSentinel, "rvv" },
            .{ std.mem.indexOfSentinel, "stdlib" },
        }) |impl| {
            const func, const name = impl;
            _ = name;

            var i: u32 = 0;
            var cycles: u64 = 0;
            while (i < iterations_per_byte + warmup_iterations) : (i += 1) {
                const start = rdtsc();
                std.mem.doNotOptimizeAway(func(u64, 0, @ptrCast(slice)));
                const end = rdtsc();
                // Only accumulate once the warmup iterations are done
                if (i >= warmup_iterations) cycles += (end - start);
            }

            // Average ticks per call at this buffer size
            const avg_cycles = cycles / iterations_per_byte;
            try writer.print("{d},", .{avg_cycles});
        }

        try writer.writeAll("\n");
    }
}
pub fn indexOfSentinel(comptime T: type, comptime sentinel: T, p: [*:sentinel]const T) usize {
    const size = switch (@bitSizeOf(T)) {
        8, 16, 32, 64 => |size| size,
        else => @compileError("unsupported size"),
    };
    // Shift amount that converts an element count into a byte count for this element size
    const shift = switch (@bitSizeOf(T)) {
        8 => 0,
        16 => 1,
        32 => 2,
        64 => 3,
        else => unreachable,
    };
    const clobber: std.builtin.assembly.Clobbers = .{
        .x11 = true, // a1
        .x12 = true, // a2
        .x13 = true, // a3
        .x14 = true, // a4
        .v0 = true,
        .v8 = true,
    };
    return asm volatile (std.fmt.comptimePrint(
        \\ mv a3, %[ptr]                               # Save start
        \\ vsetvli a1, zero, e{[bit_size]}, m8, ta, ma # Request the maximum vector length
        \\
        \\1:
        \\ vle{[bit_size]}ff.v v8, (a3)                # Fault-only-first load of one chunk
        \\ li a1, %[sentinel]                          # Load sentinel into register
        \\ vmseq.vx v0, v8, a1                         # Set v0[i] where v8[i] == sentinel
        \\ csrr a1, vl                                 # Get elements read
        \\ vfirst.m a2, v0                             # Find first set bit, -1 if none
        \\ slli a1, a1, {[shift]}                      # Scale element count to bytes
        \\ add a3, a3, a1                              # Bump pointer
        \\ bltz a2, 1b                                 # Not found? Load the next chunk
        \\
        \\ add a4, %[ptr], a1                          # Start + size of the last chunk
        \\ sub a3, a3, a4                              # Bytes scanned before the last chunk
        \\ srli a3, a3, {[shift]}                      # Convert back to an element count
        \\ add %[result], a3, a2                       # Add the match offset within the last chunk
    , .{ .bit_size = size, .shift = shift })
        : [result] "=r" (-> usize),
        : [ptr] "r" (p),
          [sentinel] "i" (sentinel),
        : clobber);
}
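// A minimal sanity check, added here as an illustration rather than part of the
// original gist: on a riscv64 build where the V extension is available at
// runtime, the RVV kernel should agree with the scalar stdlib implementation.
// Skipped on other targets.
test "indexOfSentinel matches std.mem.indexOfSentinel" {
    const builtin = @import("builtin");
    if (builtin.cpu.arch != .riscv64) return error.SkipZigTest;
    if (!std.Target.riscv.featureSetHas(builtin.cpu.features, .v)) return error.SkipZigTest;

    // 63 elements of 0xAA followed by the zero sentinel at index 63
    const buf = [_:0]u64{0xAA} ** 63;
    try std.testing.expectEqual(
        std.mem.indexOfSentinel(u64, 0, &buf),
        indexOfSentinel(u64, 0, &buf),
    );
}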
// Reads the RISC-V time CSR (a timer tick counter, despite the x86-style name)
fn rdtsc() usize {
    var cycle: u64 = 0;
    asm volatile ("rdtime %[out]"
        : [out] "=r" (cycle),
    );
    return cycle;
}
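// Usage sketch (assumed; not shown in the gist): build for a riscv64 Linux
// target with the V extension and pass the log2 of the buffer to allocate.
// Sizes from 2^3 up to 2^(arg - 1) bytes are measured. With the file saved as
// bench.zig (hypothetical name), something like:
//
//   zig build-exe -O ReleaseFast -target riscv64-linux -mcpu=baseline+v bench.zig
//   ./bench 24
//
// Each output row is "<buffer bytes>,<rvv ticks per call>,<stdlib ticks per call>,".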