Skip to content

Instantly share code, notes, and snippets.

@akhildevelops
Last active August 8, 2025 19:16
Show Gist options
  • Select an option

  • Save akhildevelops/32571da54d5f52bfc13a20e7e70dce50 to your computer and use it in GitHub Desktop.

Select an option

Save akhildevelops/32571da54d5f52bfc13a20e7e70dce50 to your computer and use it in GitHub Desktop.
Arrow vs Lance
// benchmark on  master [!] is 📦 v0.1.0 via 🦀 v1.88.0 took 3m22s
// ❯ cargo run --release
// Compiling benchmark v0.1.0 (/home/akhil/practice/benchmark)
// Finished `release` profile [optimized] target(s) in 17.35s
// Running `target/release/benchmark`
// Inital memory: 22581248 bytes
// ==========ARROW WRITE===========
// nth record batch: 0,n_rows:1, size:2147508784bytes, memory_consumed: 2171830272bytes, record_batch_generation_duration: 838.356722ms
// nth record: 0, rb_write_time: 1.001391512s
// nth record batch: 1,n_rows:1, size:2147508784bytes, memory_consumed: 2172235776bytes, record_batch_generation_duration: 836.583332ms
// nth record: 1, rb_write_time: 928.580149ms
// nth record batch: 2,n_rows:1, size:2147508784bytes, memory_consumed: 2172235776bytes, record_batch_generation_duration: 940.09933ms
// nth record: 2, rb_write_time: 984.860962ms
// nth record batch: 3,n_rows:130945, size:1076937136bytes, memory_consumed: 1102426112bytes, record_batch_generation_duration: 498.270378ms
// nth record: 3, rb_write_time: 1.073880127s
// nth record batch: 4,n_rows:1, size:2147508784bytes, memory_consumed: 2179395584bytes, record_batch_generation_duration: 821.304245ms
// nth record: 4, rb_write_time: 21.851720346s
// nth record batch: 5,n_rows:1, size:2147508784bytes, memory_consumed: 2179407872bytes, record_batch_generation_duration: 855.267556ms
// nth record: 5, rb_write_time: 29.056329781s
// Arrow write duration: 59.686785414s
// ==========LANCE WRITE===========
// nth record batch: 0,n_rows:1, size:2147508784bytes, memory_consumed: 2181267456bytes, record_batch_generation_duration: 907.662395ms
// nth record batch: 1,n_rows:1, size:2147508784bytes, memory_consumed: 2265063424bytes, record_batch_generation_duration: 898.267797ms
// nth record batch: 2,n_rows:1, size:2147508784bytes, memory_consumed: 2312540160bytes, record_batch_generation_duration: 898.198482ms
// nth record batch: 3,n_rows:130945, size:1076937136bytes, memory_consumed: 1142280192bytes, record_batch_generation_duration: 503.981987ms
// nth record batch: 4,n_rows:1, size:2147508784bytes, memory_consumed: 2429804544bytes, record_batch_generation_duration: 850.564397ms
// nth record batch: 5,n_rows:1, size:2147508784bytes, memory_consumed: 2288320512bytes, record_batch_generation_duration: 855.774209ms
// Lance write duration: 102.396438983s
// ==========ARROW READ===========
// File Read: 25.683µs
// Record Batch Load time: 560.160977ms
// n_rows: 1,size: 12884907312bytes,compute_duration: 803ns,sum of int32: 196652806, null_count0
// Record Batch Load time: 538.233483ms
// n_rows: 1,size: 12884907312bytes,compute_duration: 815ns,sum of int32: -715889302, null_count0
// Record Batch Load time: 531.373464ms
// n_rows: 1,size: 12884907312bytes,compute_duration: 735ns,sum of int32: 1725446998, null_count0
// Record Batch Load time: 268.131638ms
// n_rows: 130945,size: 8607282096bytes,compute_duration: 61.239µs,sum of int32: 392961711, null_count65472
// Record Batch Load time: 534.104225ms
// n_rows: 1,size: 12884907312bytes,compute_duration: 809ns,sum of int32: 879693244, null_count0
// Record Batch Load time: 571.679257ms
// n_rows: 1,size: 12884907312bytes,compute_duration: 863ns,sum of int32: 1858342224, null_count0
// Total Time: 3.024525336s
// ==========LANCE READ===========
// Dataset Open308.351µs
// Dataset Scan102.985µs
// Into Stream1.091916ms
// Record Batch load time:2.667333101s
// n_rows:8192, size:6509734448bytes, compute_duration:6.585µs,sum of int32:-1415543465, null_count:4094
// Record Batch load time:12.735796ms
// n_rows:8192, size:67308080bytes, compute_duration:7.314µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:14.593261ms
// n_rows:8192, size:67308080bytes, compute_duration:12.884µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:14.033046ms
// n_rows:8192, size:67308080bytes, compute_duration:13.227µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:24.553593ms
// n_rows:8192, size:67308080bytes, compute_duration:36.827µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:18.509525ms
// n_rows:8192, size:67308080bytes, compute_duration:11.774µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:26.780241ms
// n_rows:8192, size:67308080bytes, compute_duration:61.934µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:29.11µs
// n_rows:8192, size:67308080bytes, compute_duration:43.043µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:20.501µs
// n_rows:8192, size:67308080bytes, compute_duration:40.965µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:249.998µs
// n_rows:8192, size:67308080bytes, compute_duration:53.652µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:84.029µs
// n_rows:8192, size:67308080bytes, compute_duration:29.864µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:67.604µs
// n_rows:8192, size:67308080bytes, compute_duration:30.566µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:32.38µs
// n_rows:8192, size:67308080bytes, compute_duration:18.087µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:24.724µs
// n_rows:8192, size:67308080bytes, compute_duration:30.16µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:46.43µs
// n_rows:8192, size:67308080bytes, compute_duration:32.321µs,sum of int32:-1146687488, null_count:4096
// Record Batch load time:1.645327717s
// n_rows:8070, size:4361256544bytes, compute_duration:7.041µs,sum of int32:331539498, null_count:4034
// Total Time4.482253226s
//// Dependencies in Cargo.toml
// [package]
// name = "benchmark"
// version = "0.1.0"
// edition = "2024"
// [dependencies]
// arrow = "55"
// flatbuffers = "25.2.10"
// futures = "0.3.31"
// lance = "0.32.0"
// sysinfo = "0.36.1"
// tokio = { version = "1.47.1", features = ["rt-multi-thread"] }
// src/main.rs
#![allow(unused)]
use std::fs::File;
use std::mem::MaybeUninit;
use std::path::PathBuf;
use std::str;
use std::str::FromStr;
use std::sync::Arc;
use std::sync::Mutex;
use arrow::array::Array;
use arrow::array::Float32Builder;
use arrow::array::Int32Array;
use arrow::array::Int32Builder;
use arrow::array::LargeBinaryBuilder;
use arrow::array::LargeStringBuilder;
use arrow::array::RecordBatch;
use arrow::array::RecordBatchIterator;
/// Overall generation budget (10 GiB): `RBGen::next` stops yielding batches
/// once its running `total_bytes` counter reaches this value.
const TOTAL_BYTES: usize = 10 * 1024 * 1024 * 1024;
use arrow::datatypes::DataType;
use arrow::datatypes::Field;
use arrow::datatypes::Schema;
use arrow::error::ArrowError;
use arrow::ipc::reader::FileReader;
use arrow::ipc::writer::FileWriter;
use futures::StreamExt;
use lance::Dataset;
use lance::dataset::fragment::write::FragmentCreateBuilder;
use lance::datatypes::Schema as LanceSchema;
use sysinfo::Pid;
use sysinfo::Process;
use sysinfo::System;
use tokio::runtime;
/// Minimal xorshift32 pseudo-random generator.
///
/// Kept in-file so the generated benchmark data is reproducible from a seed
/// without depending on an external RNG crate.
#[derive(Clone)]
pub struct Rng {
    /// Current generator state. Never zero after construction, because zero
    /// is a fixed point of the xorshift step.
    state: u32,
}

impl Rng {
    /// State substituted for a zero seed.
    const ZERO_SEED_FALLBACK: u32 = 0x9E37_79B9;

    /// Creates a generator from `seed`.
    ///
    /// A zero seed would make every output zero (`0 ^ (0 << k) == 0`), so it
    /// is replaced by a fixed nonzero constant. Every nonzero seed produces
    /// the usual xorshift32 sequence.
    fn init(seed: u32) -> Self {
        let state = if seed == 0 {
            Self::ZERO_SEED_FALLBACK
        } else {
            seed
        };
        Self { state }
    }
}

impl Iterator for Rng {
    type Item = u32;

    /// Advances the state with the 13 / 17 / 5 xorshift steps and returns the
    /// new state. The iterator is infinite: it never yields `None`.
    fn next(&mut self) -> Option<Self::Item> {
        // Shifts on `u32` drop the bits pushed out; they cannot overflow.
        self.state ^= self.state << 13;
        self.state ^= self.state >> 17;
        self.state ^= self.state << 5;
        Some(self.state)
    }
}
/// Size in bytes of each chunk produced while building the string payload;
/// payload lengths are therefore always a multiple of this value.
const BUFFER_LEN: usize = 4096;
/// Accounted payload size (1 GiB) at which `RBGen::next` stops adding rows
/// to the record batch it is building.
const RB_SIZE: usize = 1024 * 1024 * 1024 * 1;
/// Seed handed to `Rng::init` in `write_arrow_lance`.
const SEED: u32 = 139849023;
/// Iterator state for generating synthetic record batches.
///
/// `Clone` is derived so `write_arrow_lance` can run one copy for the Arrow
/// write and another, starting from the same state, for the Lance write.
#[derive(Clone)]
struct RBGen {
// Random source; one value is drawn from it per generated batch.
r: Rng,
// Running total of accounted payload bytes over all batches produced so far;
// compared against `TOTAL_BYTES` to decide when to stop.
total_bytes: usize,
// Schema cloned into every record batch that is built.
schema: Schema,
// Shared process handle, locked after each batch to refresh and print memory usage.
process: Arc<Mutex<SysPID>>,
// Number of batches produced so far; printed as "nth record batch".
n_rbs: usize,
}
impl Iterator for RBGen {
    type Item = Result<RecordBatch, ArrowError>;

    /// Builds and returns the next record batch.
    ///
    /// Returns `None` once the running `total_bytes` counter has reached
    /// `TOTAL_BYTES`. Otherwise one random value is drawn for the whole batch
    /// and rows are appended until the accounted size reaches `RB_SIZE`:
    ///
    /// * `int32` / `f32` alternate between the drawn value and null, starting
    ///   with the value;
    /// * `string` / `blob` both carry the same printable-ASCII payload, about
    ///   1 GiB when the drawn value is even and one `BUFFER_LEN` chunk when
    ///   it is odd.
    ///
    /// The accounted size counts 4 bytes for each numeric cell (nulls too)
    /// plus the payload length twice. After building, the batch statistics
    /// and the current process memory are printed.
    ///
    /// A failure to assemble the `RecordBatch` is yielded as `Some(Err(_))`
    /// rather than panicking.
    fn next(&mut self) -> Option<Self::Item> {
        if self.total_bytes >= TOTAL_BYTES {
            return None;
        }
        let start_rb_time = std::time::Instant::now();

        let mut int32_builder = Int32Builder::new();
        let mut f32_builder = Float32Builder::new();
        let mut str_builder = LargeStringBuilder::new();
        let mut blob_builder = LargeBinaryBuilder::new();

        let value = self.r.next().expect("Rng is an infinite iterator");

        // The payload depends only on `value`, which is fixed for the whole
        // batch, so it is generated once instead of once per row.
        let requested_chars: usize = if value % 2 == 0 {
            1024 * 1024 * 1024
        } else {
            1
        };
        let payload = cycling_ascii_payload((value % 256) as u8, requested_chars);

        let mut rb_size = 0;
        let mut emit_value = true;
        while rb_size < RB_SIZE {
            if emit_value {
                int32_builder.append_value(value as i32);
                f32_builder.append_value(f32::from_bits(value));
            } else {
                int32_builder.append_null();
                f32_builder.append_null();
            }
            emit_value = !emit_value;
            rb_size += std::mem::size_of::<u32>();
            rb_size += std::mem::size_of::<f32>();

            str_builder.append_value(&payload);
            rb_size += payload.len();
            blob_builder.append_value(&payload);
            rb_size += payload.len();
        }
        self.total_bytes += rb_size;

        let int32_arr = int32_builder.finish();
        let f32_arr = f32_builder.finish();
        let string_arr = str_builder.finish();
        let blob_arr = blob_builder.finish();
        let rb = match RecordBatch::try_new(
            Arc::new(self.schema.clone()),
            vec![
                Arc::new(int32_arr),
                Arc::new(f32_arr),
                Arc::new(string_arr),
                Arc::new(blob_arr),
            ],
        ) {
            Ok(rb) => rb,
            // Surface the failure through the iterator's `Result` item.
            Err(err) => return Some(Err(err)),
        };

        // Reporting is best-effort: it is skipped if the mutex is poisoned.
        if let Ok(mut process) = self.process.lock() {
            process.sys.refresh_all();
            println!(
                "nth record batch: {},n_rows:{}, size:{}bytes, memory_consumed: {}bytes, record_batch_generation_duration: {:?}",
                self.n_rbs,
                rb.num_rows(),
                rb.get_array_memory_size(),
                process.get_mem(),
                start_rb_time.elapsed()
            );
        }
        self.n_rbs += 1;
        Some(Ok(rb))
    }
}

/// Builds a string of printable ASCII characters (`0x21..=0x7E`).
///
/// The sequence starts at `first_char` (clamped into the printable range),
/// increases by one per character and wraps from `0x7E` back to `0x21`. The
/// string is generated in whole `BUFFER_LEN` chunks, so its length is
/// `min_len` rounded up to a multiple of `BUFFER_LEN`.
fn cycling_ascii_payload(first_char: u8, min_len: usize) -> String {
    let chunk_count = min_len.div_ceil(BUFFER_LEN);
    let mut payload = String::with_capacity(chunk_count * BUFFER_LEN);
    let mut next_char = first_char.clamp(0x21, 0x7E);
    // One scratch buffer reused for every chunk.
    let mut chunk = [0_u8; BUFFER_LEN];
    for _ in 0..chunk_count {
        for byte in chunk.iter_mut() {
            *byte = next_char;
            next_char = if next_char >= 0x7E { 0x21 } else { next_char + 1 };
        }
        // Every byte is printable ASCII, so validation cannot fail.
        payload.push_str(
            std::str::from_utf8(&chunk).expect("chunk holds only printable ASCII"),
        );
    }
    payload
}
/// Writes the same generated record batches twice: once as an Arrow IPC file
/// at `./blob` and once as a Lance dataset at `./blob.lance`.
///
/// Both writers consume a clone of one `RBGen`, so they start from the same
/// generator state. The wall-clock duration of each write (batch generation
/// included) is printed. `process` is passed on to the generator for its
/// memory reporting.
///
/// Panics on any I/O, Arrow or Lance failure.
fn write_arrow_lance(process: Arc<Mutex<SysPID>>) {
    // Four nullable columns: int32, f32, LargeUtf8 and LargeBinary.
    let schema = Schema::new(vec![
        Field::new("int32", DataType::Int32, true),
        Field::new("f32", DataType::Float32, true),
        Field::new("string", DataType::LargeUtf8, true),
        Field::new("blob", DataType::LargeBinary, true),
    ]);

    // Batch source seeded from the in-file generator; it stops on its own
    // once its byte budget is used up.
    let rb_gen = RBGen {
        r: Rng::init(SEED),
        total_bytes: 0,
        schema: schema.clone(),
        process: Arc::clone(&process),
        n_rbs: 0,
    };

    // ---- Arrow IPC file ----
    println!("==========ARROW WRITE===========");
    let mut arrow_out = File::create("./blob").unwrap();
    let mut ipc_writer = FileWriter::try_new(&mut arrow_out, &schema).unwrap();
    let arrow_timer = std::time::Instant::now();
    for (index, batch) in rb_gen.clone().enumerate() {
        // Timed per batch, after the batch has been generated.
        let batch_timer = std::time::Instant::now();
        let batch = batch.unwrap();
        ipc_writer.write(&batch).unwrap();
        println!(
            "nth record: {}, rb_write_time: {:?}",
            index,
            batch_timer.elapsed()
        );
    }
    ipc_writer.finish().unwrap();
    println!("Arrow write duration: {:?}\n", arrow_timer.elapsed());

    // ---- Lance dataset ----
    println!("==========LANCE WRITE===========");
    let lance_batches = RecordBatchIterator::new(rb_gen, Arc::new(schema.clone()));
    // Remove any dataset left over from a previous run.
    let lance_dir = PathBuf::from("./blob.lance");
    if lance_dir.exists() {
        std::fs::remove_dir_all(&lance_dir).unwrap();
    }
    let lance_runtime = runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();
    let lance_timer = std::time::Instant::now();
    lance_runtime
        .block_on(Dataset::write(lance_batches, "./blob.lance", None))
        .unwrap();
    println!("Lance write duration: {:?}", lance_timer.elapsed());
}
/// Reads the Arrow IPC file `./blob` batch by batch.
///
/// Prints how long opening the reader took, then for every batch the time
/// spent fetching it, followed by its row count, in-memory size, and the sum
/// and null count of column 0 (read as `Int32Array`) together with the time
/// taken to compute them.
///
/// Panics if the file cannot be opened or read, or if column 0 is not int32.
fn read_arrow() {
    let open_timer = std::time::Instant::now();
    let source = File::open("./blob").unwrap();
    let mut reader = FileReader::try_new(source, None).unwrap();
    println!("File Read: {:?}", open_timer.elapsed());

    loop {
        // The timer must start before `next()` so it covers the batch load.
        let load_timer = std::time::Instant::now();
        let Some(loaded) = reader.next() else {
            break;
        };
        println!("Record Batch Load time: {:?}", load_timer.elapsed());

        let compute_timer = std::time::Instant::now();
        let batch = loaded.unwrap();
        let row_count = batch.num_rows();
        let byte_size = batch.get_array_memory_size();
        let ints = batch
            .column(0)
            .as_any()
            .downcast_ref::<Int32Array>()
            .unwrap();
        let null_total = ints.null_count();
        let int_sum = arrow::compute::sum(ints).unwrap();
        let compute_elapsed = compute_timer.elapsed();
        println!(
            "n_rows: {},size: {}bytes,compute_duration: {:?},sum of int32: {}, null_count{}",
            row_count, byte_size, compute_elapsed, int_sum, null_total
        );
    }
}
/// Scans the Lance dataset `./blob.lance` batch by batch on a multi-threaded
/// Tokio runtime.
///
/// Prints the time taken to open the dataset, create the scanner and turn it
/// into a stream, then for every batch the time spent awaiting it, followed
/// by its row count, in-memory size, and the sum and null count of column 0
/// (read as `Int32Array`) together with the time taken to compute them.
///
/// Panics if the dataset cannot be opened or scanned, or if column 0 is not
/// int32.
fn read_lance() {
    let lance_runtime = runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();

    lance_runtime.block_on(async {
        let open_timer = std::time::Instant::now();
        let dataset = Dataset::open("./blob.lance").await.unwrap();
        println!("Dataset Open{:?}", open_timer.elapsed());

        let scan_timer = std::time::Instant::now();
        let scanner = dataset.scan();
        println!("Dataset Scan{:?}", scan_timer.elapsed());

        let stream_timer = std::time::Instant::now();
        let mut batches = scanner.try_into_stream().await.unwrap();
        println!("Into Stream{:?}", stream_timer.elapsed());

        loop {
            // The timer must start before the await so it covers the load.
            let load_timer = std::time::Instant::now();
            let Some(loaded) = batches.next().await else {
                break;
            };
            println!("Record Batch load time:{:?}", load_timer.elapsed());

            let compute_timer = std::time::Instant::now();
            let batch = loaded.unwrap();
            let row_count = batch.num_rows();
            let byte_size = batch.get_array_memory_size();
            let ints = batch
                .column(0)
                .as_any()
                .downcast_ref::<Int32Array>()
                .unwrap();
            let null_total = ints.null_count();
            let int_sum = arrow::compute::sum(ints).unwrap();
            let compute_elapsed = compute_timer.elapsed();
            println!(
                "n_rows:{}, size:{}bytes, compute_duration:{:?},sum of int32:{}, null_count:{}",
                row_count, byte_size, compute_elapsed, int_sum, null_total
            );
        }
    });
}
/// Pairs a process id with the `sysinfo` system handle used to look it up.
#[derive(Debug)]
struct SysPID {
// Id of the process whose memory `get_mem` reports; `main` sets it to the current process.
pid: Pid,
// System handle queried by `get_mem`; `RBGen::next` calls `refresh_all` on it before reading.
sys: System,
}
impl SysPID {
    /// Returns the value `sysinfo` reports from `Process::memory` for the
    /// tracked process, as of the last refresh of `sys`.
    ///
    /// Panics if `sys` has no entry for `pid`.
    fn get_mem(&self) -> u64 {
        self.sys
            .process(self.pid)
            .map(|tracked| tracked.memory())
            .unwrap()
    }
}
/// Benchmark driver: prints the initial process memory, writes the Arrow file
/// and the Lance dataset from the same generated data, then reads each back
/// and prints the total read time.
fn main() {
    let sys_pid = SysPID {
        pid: sysinfo::get_current_pid().expect("Failed to get current PID"),
        sys: System::new_all(),
    };
    println!("Inital memory: {} bytes", sys_pid.get_mem());

    // Write phase: Arrow and Lance are fed from the same source.
    // Observed on one run:
    //   Arrow write duration: 35.340100258s
    //   Lance write duration: 112.428478707s
    let shared_process = Arc::new(Mutex::new(sys_pid));
    write_arrow_lance(shared_process);

    // Read phase.
    // Observed on one run:
    //   Arrow read: 2.873216313s
    //   Lance read: 4.404420162s

    // Arrow file.
    println!("==========ARROW READ===========");
    let arrow_timer = std::time::Instant::now();
    read_arrow();
    println!("Total Time: {:?}", arrow_timer.elapsed());

    // Lance dataset directory.
    println!("==========LANCE READ===========");
    let lance_timer = std::time::Instant::now();
    read_lance();
    println!("Total Time{:?}", lance_timer.elapsed());
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment