Last active
August 8, 2025 19:16
-
-
Save akhildevelops/32571da54d5f52bfc13a20e7e70dce50 to your computer and use it in GitHub Desktop.
Arrow vs Lance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // benchmark on master [!] is 📦 v0.1.0 via 🦀 v1.88.0 took 3m22s | |
| // ❯ cargo run --release | |
| // Compiling benchmark v0.1.0 (/home/akhil/practice/benchmark) | |
| // Finished `release` profile [optimized] target(s) in 17.35s | |
| // Running `target/release/benchmark` | |
| // Inital memory: 22581248 bytes | |
| // ==========ARROW WRITE=========== | |
| // nth record batch: 0,n_rows:1, size:2147508784bytes, memory_consumed: 2171830272bytes, record_batch_generation_duration: 838.356722ms | |
| // nth record: 0, rb_write_time: 1.001391512s | |
| // nth record batch: 1,n_rows:1, size:2147508784bytes, memory_consumed: 2172235776bytes, record_batch_generation_duration: 836.583332ms | |
| // nth record: 1, rb_write_time: 928.580149ms | |
| // nth record batch: 2,n_rows:1, size:2147508784bytes, memory_consumed: 2172235776bytes, record_batch_generation_duration: 940.09933ms | |
| // nth record: 2, rb_write_time: 984.860962ms | |
| // nth record batch: 3,n_rows:130945, size:1076937136bytes, memory_consumed: 1102426112bytes, record_batch_generation_duration: 498.270378ms | |
| // nth record: 3, rb_write_time: 1.073880127s | |
| // nth record batch: 4,n_rows:1, size:2147508784bytes, memory_consumed: 2179395584bytes, record_batch_generation_duration: 821.304245ms | |
| // nth record: 4, rb_write_time: 21.851720346s | |
| // nth record batch: 5,n_rows:1, size:2147508784bytes, memory_consumed: 2179407872bytes, record_batch_generation_duration: 855.267556ms | |
| // nth record: 5, rb_write_time: 29.056329781s | |
| // Arrow write duration: 59.686785414s | |
| // ==========LANCE WRITE=========== | |
| // nth record batch: 0,n_rows:1, size:2147508784bytes, memory_consumed: 2181267456bytes, record_batch_generation_duration: 907.662395ms | |
| // nth record batch: 1,n_rows:1, size:2147508784bytes, memory_consumed: 2265063424bytes, record_batch_generation_duration: 898.267797ms | |
| // nth record batch: 2,n_rows:1, size:2147508784bytes, memory_consumed: 2312540160bytes, record_batch_generation_duration: 898.198482ms | |
| // nth record batch: 3,n_rows:130945, size:1076937136bytes, memory_consumed: 1142280192bytes, record_batch_generation_duration: 503.981987ms | |
| // nth record batch: 4,n_rows:1, size:2147508784bytes, memory_consumed: 2429804544bytes, record_batch_generation_duration: 850.564397ms | |
| // nth record batch: 5,n_rows:1, size:2147508784bytes, memory_consumed: 2288320512bytes, record_batch_generation_duration: 855.774209ms | |
| // Lance write duration: 102.396438983s | |
| // ==========ARROW READ=========== | |
| // File Read: 25.683µs | |
| // Record Batch Load time: 560.160977ms | |
| // n_rows: 1,size: 12884907312bytes,compute_duration: 803ns,sum of int32: 196652806, null_count0 | |
| // Record Batch Load time: 538.233483ms | |
| // n_rows: 1,size: 12884907312bytes,compute_duration: 815ns,sum of int32: -715889302, null_count0 | |
| // Record Batch Load time: 531.373464ms | |
| // n_rows: 1,size: 12884907312bytes,compute_duration: 735ns,sum of int32: 1725446998, null_count0 | |
| // Record Batch Load time: 268.131638ms | |
| // n_rows: 130945,size: 8607282096bytes,compute_duration: 61.239µs,sum of int32: 392961711, null_count65472 | |
| // Record Batch Load time: 534.104225ms | |
| // n_rows: 1,size: 12884907312bytes,compute_duration: 809ns,sum of int32: 879693244, null_count0 | |
| // Record Batch Load time: 571.679257ms | |
| // n_rows: 1,size: 12884907312bytes,compute_duration: 863ns,sum of int32: 1858342224, null_count0 | |
| // Total Time: 3.024525336s | |
| // ==========LANCE READ=========== | |
| // Dataset Open308.351µs | |
| // Dataset Scan102.985µs | |
| // Into Stream1.091916ms | |
| // Record Batch load time:2.667333101s | |
| // n_rows:8192, size:6509734448bytes, compute_duration:6.585µs,sum of int32:-1415543465, null_count:4094 | |
| // Record Batch load time:12.735796ms | |
| // n_rows:8192, size:67308080bytes, compute_duration:7.314µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:14.593261ms | |
| // n_rows:8192, size:67308080bytes, compute_duration:12.884µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:14.033046ms | |
| // n_rows:8192, size:67308080bytes, compute_duration:13.227µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:24.553593ms | |
| // n_rows:8192, size:67308080bytes, compute_duration:36.827µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:18.509525ms | |
| // n_rows:8192, size:67308080bytes, compute_duration:11.774µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:26.780241ms | |
| // n_rows:8192, size:67308080bytes, compute_duration:61.934µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:29.11µs | |
| // n_rows:8192, size:67308080bytes, compute_duration:43.043µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:20.501µs | |
| // n_rows:8192, size:67308080bytes, compute_duration:40.965µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:249.998µs | |
| // n_rows:8192, size:67308080bytes, compute_duration:53.652µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:84.029µs | |
| // n_rows:8192, size:67308080bytes, compute_duration:29.864µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:67.604µs | |
| // n_rows:8192, size:67308080bytes, compute_duration:30.566µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:32.38µs | |
| // n_rows:8192, size:67308080bytes, compute_duration:18.087µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:24.724µs | |
| // n_rows:8192, size:67308080bytes, compute_duration:30.16µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:46.43µs | |
| // n_rows:8192, size:67308080bytes, compute_duration:32.321µs,sum of int32:-1146687488, null_count:4096 | |
| // Record Batch load time:1.645327717s | |
| // n_rows:8070, size:4361256544bytes, compute_duration:7.041µs,sum of int32:331539498, null_count:4034 | |
| // Total Time4.482253226s | |
| //// Dependencies in Cargo.toml | |
| // [package] | |
| // name = "benchmark" | |
| // version = "0.1.0" | |
| // edition = "2024" | |
| // [dependencies] | |
| // arrow = "55" | |
| // flatbuffers = "25.2.10" | |
| // futures = "0.3.31" | |
| // lance = "0.32.0" | |
| // sysinfo = "0.36.1" | |
| // tokio = { version = "1.47.1", features = ["rt-multi-thread"] } | |
| // src/main.rs | |
| #![allow(unused)] | |
| use std::fs::File; | |
| use std::mem::MaybeUninit; | |
| use std::path::PathBuf; | |
| use std::str; | |
| use std::str::FromStr; | |
| use std::sync::Arc; | |
| use std::sync::Mutex; | |
| use arrow::array::Array; | |
| use arrow::array::Float32Builder; | |
| use arrow::array::Int32Array; | |
| use arrow::array::Int32Builder; | |
| use arrow::array::LargeBinaryBuilder; | |
| use arrow::array::LargeStringBuilder; | |
| use arrow::array::RecordBatch; | |
| use arrow::array::RecordBatchIterator; | |
/// Cumulative generated payload (10 GiB) at which `RBGen` stops yielding record batches.
const TOTAL_BYTES: usize = 10 * (1 << 30);
| use arrow::datatypes::DataType; | |
| use arrow::datatypes::Field; | |
| use arrow::datatypes::Schema; | |
| use arrow::error::ArrowError; | |
| use arrow::ipc::reader::FileReader; | |
| use arrow::ipc::writer::FileWriter; | |
| use futures::StreamExt; | |
| use lance::Dataset; | |
| use lance::dataset::fragment::write::FragmentCreateBuilder; | |
| use lance::datatypes::Schema as LanceSchema; | |
| use sysinfo::Pid; | |
| use sysinfo::Process; | |
| use sysinfo::System; | |
| use tokio::runtime; | |
/// Minimal xorshift32 pseudo-random generator, kept in this file so the
/// benchmark data does not depend on an external RNG crate.
#[derive(Clone)]
pub struct Rng {
    /// Current generator state; non-zero once constructed through `Rng::init`.
    state: u32,
}

impl Rng {
    /// State substituted when the caller seeds with zero.
    const ZERO_SEED_FALLBACK: u32 = 0x9E37_79B9;

    /// Creates a generator from `seed`.
    ///
    /// Zero is a fixed point of the shift/xor steps in `next` (`0 ^ (0 << k)`
    /// is still `0`), so a zero seed would make the generator emit `0`
    /// forever. A zero seed is therefore replaced by a fixed non-zero
    /// constant; every other seed is used unchanged.
    fn init(seed: u32) -> Self {
        let state = if seed == 0 {
            Self::ZERO_SEED_FALLBACK
        } else {
            seed
        };
        Self { state }
    }
}

impl Iterator for Rng {
    type Item = u32;

    /// Advances the state with the 13 / 17 / 5 shift triple and returns the
    /// new state. Always returns `Some`, so the iterator is infinite.
    fn next(&mut self) -> Option<Self::Item> {
        let mut x = self.state;
        x ^= x << 13;
        x ^= x >> 17;
        x ^= x << 5;
        self.state = x;
        Some(x)
    }
}
/// Size in bytes of the chunk used while filling each generated string.
const BUFFER_LEN: usize = 4096;
/// Payload size (1 GiB) a record batch must reach before it is emitted.
const RB_SIZE: usize = 1 << 30;
/// Fixed seed handed to `Rng::init`, so every run generates the same data.
const SEED: u32 = 139_849_023;
| #[derive(Clone)] | |
| struct RBGen { | |
| r: Rng, | |
| total_bytes: usize, | |
| schema: Schema, | |
| process: Arc<Mutex<SysPID>>, | |
| n_rbs: usize, | |
| } | |
| impl Iterator for RBGen { | |
| type Item = Result<RecordBatch, ArrowError>; | |
| fn next(&mut self) -> Option<Self::Item> { | |
| if self.total_bytes >= TOTAL_BYTES { | |
| return None; | |
| } | |
| let start_rb_time = std::time::Instant::now(); | |
| let mut int32_builder = Int32Builder::new(); | |
| let mut f32_builder = Float32Builder::new(); | |
| let mut str_builder = LargeStringBuilder::new(); | |
| let mut blob_builder = LargeBinaryBuilder::new(); | |
| let value = self.r.next().unwrap(); | |
| let mut rb_size = 0; | |
| let mut is_null = true; | |
| while rb_size < RB_SIZE { | |
| if is_null { | |
| int32_builder.append_value(value as i32); | |
| f32_builder.append_value(f32::from_bits(value)); | |
| } else { | |
| int32_builder.append_null(); | |
| f32_builder.append_null(); | |
| } | |
| is_null = !is_null; | |
| rb_size += std::mem::size_of::<u32>(); | |
| rb_size += std::mem::size_of::<f32>(); | |
| // Create Random String | |
| let mut written_chars: u32 = 0; | |
| let u8_value = (value % 256) as u8; | |
| let n_chars = if value % 2 == 0 { | |
| 1024 * 1024 * 1024 | |
| } else { | |
| 1 | |
| }; | |
| let mut initial_char = std::cmp::min(std::cmp::max(u8_value, 0x21), 0x7E); | |
| let mut string = String::with_capacity(n_chars as usize); | |
| while written_chars < n_chars { | |
| let mut string_buffer = vec![0x30_u8; BUFFER_LEN]; | |
| unsafe { | |
| for char in string_buffer.iter_mut() { | |
| *char = initial_char; | |
| initial_char += 1; | |
| if initial_char == 0x7F { | |
| initial_char = 0x21; | |
| } | |
| } | |
| written_chars += string_buffer.len() as u32; | |
| let string_buffer_str = | |
| str::from_boxed_utf8_unchecked(string_buffer.into_boxed_slice()); | |
| string.push_str(&string_buffer_str); | |
| } | |
| } | |
| str_builder.append_value(&string); | |
| rb_size += string.len(); | |
| blob_builder.append_value(&string); | |
| rb_size += string.len(); | |
| } | |
| self.total_bytes += rb_size; | |
| let int32_arr = int32_builder.finish(); | |
| let f32_arr = f32_builder.finish(); | |
| let string_arr = str_builder.finish(); | |
| let blob_arr = blob_builder.finish(); | |
| let rb = RecordBatch::try_new( | |
| Arc::new(self.schema.clone()), | |
| vec![ | |
| Arc::new(int32_arr), | |
| Arc::new(f32_arr), | |
| Arc::new(string_arr), | |
| Arc::new(blob_arr), | |
| ], | |
| ) | |
| .unwrap(); | |
| if let Ok(mut process) = self.process.lock() { | |
| process.sys.refresh_all(); | |
| // | |
| println!( | |
| "nth record batch: {},n_rows:{}, size:{}bytes, memory_consumed: {}bytes, record_batch_generation_duration: {:?}", | |
| self.n_rbs, | |
| rb.num_rows(), | |
| rb.get_array_memory_size(), | |
| process.get_mem(), | |
| start_rb_time.elapsed() | |
| ); | |
| } | |
| self.n_rbs += 1; | |
| Some(Ok(rb)) | |
| } | |
| } | |
| fn write_arrow_lance(process: Arc<Mutex<SysPID>>) { | |
| // Custom random generator for transparency | |
| let r = Rng::init(SEED); | |
| // 4 different datatyp columns: int32, f32, LargeUtf8 and LargeBinary | |
| let int32_field = Field::new("int32", DataType::Int32, true); | |
| let f32_field = Field::new("f32", DataType::Float32, true); | |
| let string_field = Field::new("string", DataType::LargeUtf8, true); | |
| let blob_field = Field::new("blob", DataType::LargeBinary, true); | |
| let schema = Schema::new(vec![int32_field, f32_field, string_field, blob_field]); | |
| // Generates Record Batches until 10GB | |
| let rb_gen = RBGen { | |
| r: r, | |
| total_bytes: 0, | |
| schema: schema.clone(), | |
| process: process.clone(), | |
| n_rbs: 0, | |
| }; | |
| println!("==========ARROW WRITE==========="); | |
| // Arrow Writes | |
| let mut file = File::create("./blob").unwrap(); | |
| let mut fw = FileWriter::try_new(&mut file, &schema).unwrap(); | |
| let arrow_file_start = std::time::Instant::now(); | |
| for (nth, rb) in rb_gen.clone().into_iter().enumerate() { | |
| let arr_start = std::time::Instant::now(); | |
| fw.write(&rb.unwrap()).unwrap(); | |
| println!( | |
| "nth record: {}, rb_write_time: {:?}", | |
| nth, | |
| arr_start.elapsed() | |
| ); | |
| } | |
| fw.finish().unwrap(); | |
| println!("Arrow write duration: {:?}\n", arrow_file_start.elapsed()); | |
| println!("==========LANCE WRITE==========="); | |
| //Lance Writes | |
| let rb_iter = RecordBatchIterator::new(rb_gen, Arc::new(schema.clone())); | |
| let lance_file = PathBuf::from_str("./blob.lance").unwrap(); | |
| if lance_file.exists() { | |
| std::fs::remove_dir_all(&lance_file).unwrap(); | |
| } | |
| let rt = runtime::Builder::new_multi_thread() | |
| .enable_all() | |
| .build() | |
| .unwrap(); | |
| let lance_file_start = std::time::Instant::now(); | |
| rt.block_on(Dataset::write(rb_iter, "./blob.lance", None)) | |
| .unwrap(); | |
| println!("Lance write duration: {:?}", lance_file_start.elapsed()); | |
| } | |
| fn read_arrow() { | |
| let start_read = std::time::Instant::now(); | |
| let file = File::open("./blob").unwrap(); | |
| let mut fr = FileReader::try_new(file, None).unwrap(); | |
| println!("File Read: {:?}", start_read.elapsed()); | |
| loop { | |
| let rb_read = std::time::Instant::now(); | |
| if let Some(rb_result) = fr.next() { | |
| println!("Record Batch Load time: {:?}", rb_read.elapsed()); | |
| let finish = std::time::Instant::now(); | |
| let rb = rb_result.unwrap(); | |
| let n_rows = rb.num_rows(); | |
| let size = rb.get_array_memory_size(); | |
| let col = rb.column(0).as_any().downcast_ref::<Int32Array>().unwrap(); | |
| let nulls = col.null_count(); | |
| let value = arrow::compute::sum(col).unwrap(); | |
| let end = finish.elapsed(); | |
| println!( | |
| "n_rows: {},size: {}bytes,compute_duration: {:?},sum of int32: {}, null_count{}", | |
| n_rows, size, end, value, nulls | |
| ); | |
| } else { | |
| break; | |
| } | |
| } | |
| } | |
| fn read_lance() { | |
| let block = async { | |
| let open_time = std::time::Instant::now(); | |
| let dataset = Dataset::open("./blob.lance").await.unwrap(); | |
| println!("Dataset Open{:?}", open_time.elapsed()); | |
| let scanner_time = std::time::Instant::now(); | |
| let scanner = dataset.scan(); | |
| println!("Dataset Scan{:?}", scanner_time.elapsed()); | |
| let stream_time = std::time::Instant::now(); | |
| let mut stream = scanner.try_into_stream().await.unwrap(); | |
| println!("Into Stream{:?}", stream_time.elapsed()); | |
| loop { | |
| let rb_read = std::time::Instant::now(); | |
| if let Some(rb_result) = stream.next().await { | |
| println!("Record Batch load time:{:?}", rb_read.elapsed()); | |
| let finish = std::time::Instant::now(); | |
| let rb = rb_result.unwrap(); | |
| let n_rows = rb.num_rows(); | |
| let size = rb.get_array_memory_size(); | |
| let col = rb.column(0).as_any().downcast_ref::<Int32Array>().unwrap(); | |
| let nulls = col.null_count(); | |
| let value = arrow::compute::sum(col).unwrap(); | |
| let end = finish.elapsed(); | |
| println!( | |
| "n_rows:{}, size:{}bytes, compute_duration:{:?},sum of int32:{}, null_count:{}", | |
| n_rows, size, end, value, nulls | |
| ); | |
| } else { | |
| break; | |
| } | |
| } | |
| }; | |
| let rt = runtime::Builder::new_multi_thread() | |
| .enable_all() | |
| .build() | |
| .unwrap(); | |
| rt.block_on(block); | |
| } | |
| #[derive(Debug)] | |
| struct SysPID { | |
| pid: Pid, | |
| sys: System, | |
| } | |
| impl SysPID { | |
| fn get_mem(&self) -> u64 { | |
| let pid = self.sys.process(self.pid.clone()).unwrap(); | |
| pid.memory() | |
| } | |
| } | |
| fn main() { | |
| let current_pid = sysinfo::get_current_pid().expect("Failed to get current PID"); | |
| let sys = System::new_all(); | |
| let sys_pid = SysPID { | |
| pid: current_pid, | |
| sys: sys, | |
| }; | |
| println!("Inital memory: {} bytes", sys_pid.get_mem()); | |
| // Write arrow and lance datasets from same source | |
| //// Observed: | |
| //// Arrow write duration: 35.340100258s | |
| //// Lance write duration: 112.428478707s | |
| write_arrow_lance(Arc::new(Mutex::new(sys_pid))); | |
| // File Reads | |
| // Note: Uncomment the inner durations if needed. | |
| // Commented them not to interfere with actual intended operation. | |
| //// Observed: | |
| //// Arrow read: 2.873216313s | |
| //// Lance read: 4.404420162s | |
| //// Read Arrow File | |
| println!("==========ARROW READ==========="); | |
| let arrow_t = std::time::Instant::now(); | |
| read_arrow(); | |
| println!("Total Time: {:?}", arrow_t.elapsed()); | |
| //// Read Lance Folder | |
| println!("==========LANCE READ==========="); | |
| let lance_t = std::time::Instant::now(); | |
| read_lance(); | |
| println!("Total Time{:?}", lance_t.elapsed()); | |
| } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment