Skip to content

Instantly share code, notes, and snippets.

@leonardoalt
Last active March 4, 2026 22:19
Show Gist options
  • Select an option

  • Save leonardoalt/bf5e6fc8c9608047e3a5145aa302c935 to your computer and use it in GitHub Desktop.

Select an option

Save leonardoalt/bf5e6fc8c9608047e3a5145aa302c935 to your computer and use it in GitHub Desktop.
CUDA diffs: WOMIR HintStore vs upstream OpenVM rv32im HintStore
Diff of extensions/womir_circuit/cuda/src/hintstore.cu vs upstream extensions/rv32im/circuit/cuda/src/hintstore.cu
WOMIR-specific changes:
- WomirExecutionState (has fp) instead of ExecutionState
- fp and fp_read_aux fields in record header and columns
- Timestamp delta is 4 per row (fp_read=+0, mem_ptr_read=+1, num_words_read=+2, write=+3) instead of 3
- womir_ prefix on kernel/extern names
- Different includes (histogram.cuh, offline_checker.cuh, womir/execution.cuh)
--- /home/leo/.cargo/git/checkouts/openvm-77dd23e285a1262c/72e9013/extensions/rv32im/circuit/cuda/src/hintstore.cu 2026-03-02 19:09:24.445511386 +0100
+++ /home/leo/devel/womir-openvm/.claude/worktrees/cuda-hintstore-tracegen/extensions/womir_circuit/cuda/src/hintstore.cu 2026-03-04 23:18:19.510227234 +0100
@@ -1,11 +1,12 @@
#include "launcher.cuh"
#include "primitives/constants.h"
-#include "primitives/execution.h"
+#include "primitives/histogram.cuh"
#include "primitives/trace_access.h"
#include "system/memory/controller.cuh"
+#include "system/memory/offline_checker.cuh"
+#include "womir/execution.cuh"
using namespace riscv;
-using namespace program;
template <typename T> struct Rv32HintStoreCols {
// common
@@ -15,7 +16,8 @@
// should be 1 for single
T rem_words_limbs[RV32_REGISTER_NUM_LIMBS];
- ExecutionState<T> from_state;
+ WomirExecutionState<T> from_state;
+ MemoryReadAuxCols<T> fp_read_aux;
T mem_ptr_ptr;
T mem_ptr_limbs[RV32_REGISTER_NUM_LIMBS];
MemoryReadAuxCols<T> mem_ptr_aux_cols;
@@ -36,6 +38,9 @@
uint32_t from_pc;
uint32_t timestamp;
+ uint32_t fp;
+ MemoryReadAuxRecord fp_read_aux;
+
uint32_t mem_ptr_ptr;
uint32_t mem_ptr;
MemoryReadAuxRecord mem_ptr_aux_record;
@@ -72,7 +77,8 @@
uint32_t local_idx
) {
bool is_single = record.num_words_ptr == UINT32_MAX;
- uint32_t timestamp = record.timestamp + local_idx * 3;
+ // Timestamp delta is 4 per row: fp_read=+0, mem_ptr_read=+1, num_words_read=+2, write=+3
+ uint32_t timestamp = record.timestamp + local_idx * 4;
uint32_t rem_words = record.num_words - local_idx;
uint32_t mem_ptr = record.mem_ptr + local_idx * (uint32_t)RV32_REGISTER_NUM_LIMBS;
auto rem_words_limbs = reinterpret_cast<uint8_t *>(&rem_words);
@@ -82,6 +88,7 @@
COL_WRITE_VALUE(row, Rv32HintStoreCols, is_buffer, !is_single);
COL_WRITE_ARRAY(row, Rv32HintStoreCols, rem_words_limbs, rem_words_limbs);
COL_WRITE_VALUE(row, Rv32HintStoreCols, from_state.pc, record.from_pc);
+ COL_WRITE_VALUE(row, Rv32HintStoreCols, from_state.fp, record.fp);
COL_WRITE_VALUE(row, Rv32HintStoreCols, from_state.timestamp, timestamp);
COL_WRITE_VALUE(row, Rv32HintStoreCols, mem_ptr_ptr, record.mem_ptr_ptr);
COL_WRITE_ARRAY(row, Rv32HintStoreCols, mem_ptr_limbs, mem_ptr_limbs);
@@ -94,11 +101,17 @@
(record.num_words >> msl_rshift) << msl_lshift
);
mem_helper.fill(
+ row.slice_from(COL_INDEX(Rv32HintStoreCols, fp_read_aux)),
+ record.fp_read_aux.prev_timestamp,
+ timestamp
+ );
+ mem_helper.fill(
row.slice_from(COL_INDEX(Rv32HintStoreCols, mem_ptr_aux_cols)),
record.mem_ptr_aux_record.prev_timestamp,
- timestamp
+ timestamp + 1
);
} else {
+ mem_helper.fill_zero(row.slice_from(COL_INDEX(Rv32HintStoreCols, fp_read_aux)));
mem_helper.fill_zero(row.slice_from(COL_INDEX(Rv32HintStoreCols, mem_ptr_aux_cols)));
}
@@ -106,7 +119,7 @@
mem_helper.fill(
row.slice_from(COL_INDEX(Rv32HintStoreCols, num_words_aux_cols)),
record.num_words_read.prev_timestamp,
- timestamp + 1
+ timestamp + 2
);
COL_WRITE_VALUE(row, Rv32HintStoreCols, is_buffer_start, 1);
COL_WRITE_VALUE(row, Rv32HintStoreCols, num_words_ptr, record.num_words_ptr);
@@ -120,7 +133,7 @@
mem_helper.fill(
row.slice_from(COL_INDEX(Rv32HintStoreCols, write_aux)),
write.write_aux.prev_timestamp,
- timestamp + 2
+ timestamp + 3
);
COL_WRITE_ARRAY(row, Rv32HintStoreCols, data, write.data);
@@ -136,7 +149,7 @@
uint32_t local_idx;
};
-__global__ void hintstore_tracegen(
+__global__ void womir_hintstore_tracegen(
Fp *trace,
size_t height,
uint8_t *records,
@@ -174,7 +187,7 @@
}
}
-extern "C" int _hintstore_tracegen(
+extern "C" int _womir_hintstore_tracegen(
Fp *__restrict__ d_trace,
size_t height,
size_t width,
@@ -191,7 +204,7 @@
assert(width == sizeof(Rv32HintStoreCols<uint8_t>));
auto [grid, block] = kernel_launch_params(height, 512);
- hintstore_tracegen<<<grid, block>>>(
+ womir_hintstore_tracegen<<<grid, block>>>(
d_trace,
height,
d_records,
Diff of extensions/womir_circuit/src/hintstore/cuda.rs vs upstream extensions/rv32im/circuit/src/hintstore/cuda.rs
Only import path differences (RV32_CELL_BITS comes from crate::adapters instead of openvm_instructions::riscv).
The logic is identical to upstream.
--- /home/leo/.cargo/git/checkouts/openvm-77dd23e285a1262c/72e9013/extensions/rv32im/circuit/src/hintstore/cuda.rs 2026-03-02 19:09:24.448511400 +0100
+++ /home/leo/devel/womir-openvm/.claude/worktrees/cuda-hintstore-tracegen/extensions/womir_circuit/src/hintstore/cuda.rs 2026-03-04 23:11:50.669880875 +0100
@@ -12,12 +12,11 @@
base::DeviceMatrix, chip::get_empty_air_proving_ctx, prover_backend::GpuBackend, types::F,
};
use openvm_cuda_common::copy::MemCopyH2D;
-use openvm_instructions::riscv::RV32_CELL_BITS;
-use openvm_stark_backend::{prover::types::AirProvingContext, Chip};
+use openvm_stark_backend::{Chip, prover::types::AirProvingContext};
use crate::{
- cuda_abi::hintstore_cuda::tracegen, Rv32HintStoreCols, Rv32HintStoreLayout,
- Rv32HintStoreRecordMut,
+ Rv32HintStoreCols, Rv32HintStoreLayout, Rv32HintStoreRecordMut, adapters::RV32_CELL_BITS,
+ cuda_abi::hintstore_cuda::tracegen,
};
#[derive(new)]
Diff of the hintstore_cuda module in extensions/womir_circuit/src/cuda_abi.rs vs upstream extensions/rv32im/circuit/src/cuda_abi.rs
Only differences:
- womir_ prefix on FFI function name
- unsafe extern "C" (Rust 2024 edition compatibility)
- Inner unsafe block in tracegen wrapper
--- /tmp/upstream_hintstore_abi.rs 2026-03-04 23:13:51.610266348 +0100
+++ /tmp/womir_hintstore_abi.rs 2026-03-04 23:13:51.611266351 +0100
@@ -1,8 +1,8 @@
pub mod hintstore_cuda {
use super::{super::hintstore::OffsetInfo, *};
- extern "C" {
- pub fn _hintstore_tracegen(
+ unsafe extern "C" {
+ pub fn _womir_hintstore_tracegen(
d_trace: *mut F,
height: usize,
width: usize,
@@ -31,19 +31,21 @@
bitwise_num_bits: u32,
timestamp_max_bits: u32,
) -> Result<(), CudaError> {
- CudaError::from_result(_hintstore_tracegen(
- d_trace.as_mut_ptr(),
- height,
- d_trace.len() / height,
- d_records.as_ptr(),
- rows_used,
- d_record_offsets.as_ptr(),
- pointer_max_bits,
- d_range_checker.as_mut_ptr() as *mut u32,
- d_range_checker.len() as u32,
- d_bitwise_lookup.as_mut_ptr() as *mut u32,
- bitwise_num_bits,
- timestamp_max_bits,
- ))
+ unsafe {
+ CudaError::from_result(_womir_hintstore_tracegen(
+ d_trace.as_mut_ptr(),
+ height,
+ d_trace.len() / height,
+ d_records.as_ptr(),
+ rows_used,
+ d_record_offsets.as_ptr(),
+ pointer_max_bits,
+ d_range_checker.as_mut_ptr() as *mut u32,
+ d_range_checker.len() as u32,
+ d_bitwise_lookup.as_mut_ptr() as *mut u32,
+ bitwise_num_bits,
+ timestamp_max_bits,
+ ))
+ }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment