Created
November 28, 2025 04:54
-
-
Save Rexicon226/0ebd1f4ba873ef35ee6b0374abb3c168 to your computer and use it in GitHub Desktop.
RVV indexOfSentinel benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const std = @import("std"); | |
| const iterations_per_byte = 1000; | |
| const warmup_iterations = 10; | |
/// Benchmark driver comparing the RVV `indexOfSentinel` in this file against
/// `std.mem.indexOfSentinel`.
///
/// Usage: `<program> <max-exponent>`. For every exponent `e` in `3..max-exponent`
/// (upper bound exclusive) a buffer prefix of `2^e` bytes is filled so that the
/// sentinel sits in the last 8 bytes, then each implementation is timed.
///
/// Output is CSV on stdout, one row per size:
/// `<size-in-bytes>,<rvv average>,<stdlib average>,`
/// where each average is the summed `rdtsc()` delta of `iterations_per_byte`
/// timed calls (after `warmup_iterations` untimed calls) divided by
/// `iterations_per_byte`.
///
/// Errors: `error.MissingArgument` when no exponent is given,
/// `error.InvalidArgument` when the exponent is below 3, plus whatever the
/// parsing, allocation, affinity and write calls return.
pub fn main() !void {
    const allocator = std.heap.smp_allocator;

    // Pin to a single CPU: only bit 0 of the affinity mask is set. Sizing the
    // mask from `cpu_set_t` itself avoids hard-coding its word count.
    var cpu_set: std.os.linux.cpu_set_t = @splat(0);
    cpu_set[0] = 0b0001;
    try std.os.linux.sched_setaffinity(0, &cpu_set);

    // Empty buffer => every print goes straight to stdout, nothing to flush.
    var stdout = std.fs.File.stdout().writer(&.{});
    const writer = &stdout.interface;

    const args = try std.process.argsAlloc(allocator);
    defer std.process.argsFree(allocator, args);
    if (args.len < 2) {
        std.debug.print("usage: <program> <max-exponent>\n", .{});
        return error.MissingArgument;
    }

    const max_exponent = try std.fmt.parseInt(usize, args[1], 10);
    // The size loop starts at 2^3 bytes (one u64); a smaller bound would make
    // the `3..max_exponent` range invalid.
    if (max_exponent < 3) return error.InvalidArgument;

    const buffer_len = try std.math.powi(usize, 2, max_exponent);
    const buffer = try allocator.alignedAlloc(u8, .of(u64), buffer_len);
    defer allocator.free(buffer);

    for (3..max_exponent) |exponent| {
        const len = try std.math.powi(usize, 2, exponent);
        const slice = buffer[0..len];

        // Worst case: the sentinel is at the very end, so the whole slice is scanned.
        @memset(slice, 0xAA);
        // Zero the last 8 bytes so every element width from u8 to u64 finds a sentinel.
        slice[len - 8 ..][0..8].* = @splat(0);

        try writer.print("{},", .{len});

        inline for (.{
            .{ indexOfSentinel, "rvv" },
            .{ std.mem.indexOfSentinel, "stdlib" },
        }) |impl| {
            const func, const name = impl;
            _ = name; // Only labels the column order; not printed.

            var ticks: u64 = 0;
            for (0..warmup_iterations + iterations_per_byte) |i| {
                const start = rdtsc();
                std.mem.doNotOptimizeAway(func(u64, 0, @ptrCast(slice)));
                const end = rdtsc();
                // `>=` so that exactly `iterations_per_byte` samples are summed,
                // matching the divisor below.
                if (i >= warmup_iterations) ticks += end - start;
            }

            try writer.print("{d},", .{ticks / iterations_per_byte});
        }

        try writer.writeAll("\n");
    }
}
/// Returns the index, counted in elements of `T`, of the first element equal
/// to `sentinel` at or after `p`, using RISC-V Vector (RVV) instructions.
///
/// `T` must be 8, 16, 32 or 64 bits wide; any other width is a compile error.
/// `sentinel` is encoded as an immediate operand of the assembly.
///
/// The scan uses unit-stride fault-only-first loads (`vle<N>ff.v`) with
/// LMUL=8, so each chunk is loaded into the v8..v15 register group and the
/// comparison mask is produced in v0. Scratch registers a1-a4 are used.
pub fn indexOfSentinel(comptime T: type, comptime sentinel: T, p: [*:sentinel]const T) usize {
    const bit_size = @bitSizeOf(T);
    // log2 of the element size in bytes. `vl` and `vfirst.m` count elements,
    // while the cursor is a byte address, so this shift converts between them.
    const byte_shift = switch (bit_size) {
        8 => 0,
        16 => 1,
        32 => 2,
        64 => 3,
        else => @compileError("unsupported size"),
    };
    const clobber: std.builtin.assembly.Clobbers = .{
        .x11 = true, // a1
        .x12 = true, // a2
        .x13 = true, // a3
        .x14 = true, // a4
        .v0 = true,
        // With LMUL=8 the load writes the whole v8..v15 register group.
        .v8 = true,
        .v9 = true,
        .v10 = true,
        .v11 = true,
        .v12 = true,
        .v13 = true,
        .v14 = true,
        .v15 = true,
    };
    return asm volatile (std.fmt.comptimePrint(
            \\ mv a3, %[ptr]                                # a3 = cursor, starts at p
            \\ li a4, %[sentinel]                           # a4 = sentinel (loop invariant)
            \\
            \\1:
            \\ vsetvli a1, zero, e{[bit_size]}, m8, ta, ma  # Re-request VLMAX; a previous ff load may have trimmed vl
            \\ vle{[bit_size]}ff.v v8, (a3)                 # Fault-only-first load of one chunk
            \\ vmseq.vx v0, v8, a4                          # Set v0[i] where v8[i] = sentinel
            \\ csrr a1, vl                                  # a1 = elements actually loaded
            \\ vfirst.m a2, v0                              # a2 = first set bit, or -1 if none
            \\ slli a1, a1, {[byte_shift]}                  # Elements -> bytes
            \\ add a3, a3, a1                               # Bump cursor past this chunk
            \\ bltz a2, 1b                                  # Not found? Next chunk
            \\
            \\ sub a3, a3, a1                               # Rewind to the start of the matching chunk
            \\ sub a3, a3, %[ptr]                           # Byte offset of that chunk from p
            \\ srli a3, a3, {[byte_shift]}                  # Bytes -> elements
            \\ add %[result], a3, a2                        # Chunk offset + index inside the chunk
        , .{ .bit_size = bit_size, .byte_shift = byte_shift })
        : [result] "=r" (-> usize),
        : [ptr] "r" (p),
          [sentinel] "i" (sentinel),
        : clobber);
}
/// Reads a timestamp with the RISC-V `rdtime` instruction and returns the raw
/// counter value.
///
/// Despite the x86-style name, this executes `rdtime`; the benchmark loop in
/// this file subtracts two readings to time a single call.
fn rdtsc() usize {
    const ticks: u64 = asm volatile ("rdtime %[ticks]"
        : [ticks] "=r" (-> u64),
    );
    return ticks;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment