@maxjustus
Created December 6, 2025 01:28
Bench script for new multiSearchAny impl in ClickHouse
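The script times two shapes of the same predicate against one generated Parquet file. As a minimal sketch (the file name and the tiny pattern lists here are illustrative; the real queries use thousands of generated nomatch_<i> patterns):

-- Single call over the whole pattern set; with the new implementation this is the
-- shape expected to route to Aho-Corasick (daachorse) once the pattern count
-- exceeds the 255-needle limit.
SELECT * FROM file('test_data.parquet', Parquet)
WHERE multiSearchAny(data, splitByString('$str$', 'nomatch_0$str$nomatch_1$str$nomatch_2'))

-- Chunked baseline: one call per block of at most 255 patterns, ORed together,
-- so every call stays on the existing Volnitsky path.
SELECT * FROM file('test_data.parquet', Parquet)
WHERE multiSearchAny(data, splitByString('$str$', 'nomatch_0$str$nomatch_1'))
   OR multiSearchAny(data, splitByString('$str$', 'nomatch_2$str$nomatch_3'))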
#!/usr/bin/env python3
"""
Micro-benchmark: Aho-Corasick (daachorse) vs chunked Volnitsky for large pattern counts.
Generates test data once to a file, then benchmarks queries against that file.
Usage:
python multi_search_bench.py [--rows N]
"""
import subprocess
import sys
import re
import statistics
import tempfile
import os
import argparse
from pathlib import Path
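# The existing Volnitsky-based multiSearchAny accepts at most 255 needles per call,
# so the chunked baseline below splits larger pattern sets across ORed calls.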
VOLNITSKY_MAX_PATTERNS = 255
# Use splitByString instead of array literals to avoid AST size explosion.
# Array literals like ['p1', 'p2', ...] create one AST node per element.
# splitByString('$str$', 'p1$str$p2$str$...') keeps the AST small and constructs
# the array at runtime. The $str$ delimiter allows patterns to contain commas.
PATTERN_DELIMITER = "$str$"
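# For illustration, with two (hypothetical) patterns "foo" and "bar" the generated
# predicate is
#   multiSearchAny(data, splitByString('$str$', 'foo$str$bar'))
# rather than
#   multiSearchAny(data, ['foo', 'bar'])
# so the SQL stays a handful of AST nodes no matter how many patterns are passed.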

def get_clickhouse_binary():
    # Prefer a local development build if present, otherwise fall back to `clickhouse` on PATH.
    macbuild_path = Path("/Users/audio/dev/ClickHouse/macbuild/programs/clickhouse")
    if macbuild_path.exists():
        return str(macbuild_path)
    return "clickhouse"

def create_test_data_file(binary: str, num_rows: int) -> str:
    # Reserve a unique path, then let clickhouse-local write the Parquet file itself.
    fd, filepath = tempfile.mkstemp(suffix=".parquet", prefix="test_data_")
    os.close(fd)
    os.remove(filepath)
    query = f"""
        SELECT * FROM generateRandom('data String')
        LIMIT {num_rows}
        INTO OUTFILE '{filepath}'
        FORMAT Parquet
    """
    result = subprocess.run(
        [binary, "local", "--query", query],
        capture_output=True, text=True, timeout=120
    )
    if result.returncode != 0 or not os.path.exists(filepath):
        print(f"Error generating test data: {result.stderr}", file=sys.stderr)
        return None
    file_size = os.path.getsize(filepath)
    print(f"Generated: {filepath} ({file_size:,} bytes, {num_rows:,} rows)")
    return filepath

def build_pattern_string(pattern_count: int) -> str:
    return PATTERN_DELIMITER.join([f"nomatch_{i}" for i in range(pattern_count)])

def build_chunked_query(data_file: str, pattern_count: int, case_insensitive: bool) -> str:
    # OR together multiple calls of at most VOLNITSKY_MAX_PATTERNS patterns each,
    # so every call stays on the existing Volnitsky code path.
    func = "multiSearchAnyCaseInsensitive" if case_insensitive else "multiSearchAny"
    chunks = []
    for start in range(0, pattern_count, VOLNITSKY_MAX_PATTERNS):
        end = min(start + VOLNITSKY_MAX_PATTERNS, pattern_count)
        pattern_str = PATTERN_DELIMITER.join([f"nomatch_{i}" for i in range(start, end)])
        chunks.append(f"{func}(data, splitByString('{PATTERN_DELIMITER}', '{pattern_str}'))")
    return f"SELECT * FROM file('{data_file}', Parquet) WHERE {' OR '.join(chunks)}"

def run_query(binary: str, query: str, timeout: int = 600):
    # Pass the query via a temp file: the pattern list can exceed command-line limits.
    fd, query_file = tempfile.mkstemp(suffix=".sql", prefix="query_")
    os.write(fd, f"{query} FORMAT Null".encode('utf-8'))
    os.close(fd)
    try:
        result = subprocess.run(
            [binary, "local", "--time", "--max_query_size=10000000000", "--queries-file", query_file],
            capture_output=True, text=True, timeout=timeout
        )
        if result.returncode == 0:
            # `--time` prints the elapsed seconds on stderr; convert to milliseconds.
            match = re.search(r'^(\d+\.\d+)', result.stderr.strip())
            if match:
                return float(match.group(1)) * 1000, None
        return None, result.stderr.strip()[:200] if result.stderr else f"Exit code {result.returncode}"
    except subprocess.TimeoutExpired:
        return None, "TIMEOUT"
    except Exception as e:
        return None, str(e)[:200]
    finally:
        os.remove(query_file)

def benchmark(binary: str, data_file: str, pattern_count: int, iterations: int, case_insensitive: bool):
    func = "multiSearchAnyCaseInsensitive" if case_insensitive else "multiSearchAny"
    pattern_str = build_pattern_string(pattern_count)
    query_single = f"SELECT * FROM file('{data_file}', Parquet) WHERE {func}(data, splitByString('{PATTERN_DELIMITER}', '{pattern_str}'))"
    query_chunked = build_chunked_query(data_file, pattern_count, case_insensitive)

    # Warm-up run for each query shape before timing.
    run_query(binary, query_single)
    run_query(binary, query_chunked)

    aho_times, chunked_times, errors = [], [], {}
    for _ in range(iterations):
        t, err = run_query(binary, query_single)
        if t is not None:
            aho_times.append(t)
        elif err and "aho" not in errors:
            errors["aho"] = err
    for _ in range(iterations):
        t, err = run_query(binary, query_chunked)
        if t is not None:
            chunked_times.append(t)
        elif err and "chunked" not in errors:
            errors["chunked"] = err

    return (
        statistics.median(aho_times) if aho_times else None,
        statistics.median(chunked_times) if chunked_times else None,
        errors,
    )

def main():
    parser = argparse.ArgumentParser(description="Aho-Corasick vs Volnitsky micro-benchmark")
    parser.add_argument("--rows", "-r", type=int, default=100000, help="number of random rows to generate")
    args = parser.parse_args()
    binary = get_clickhouse_binary()

    print("Aho-Corasick (daachorse) vs Chunked Volnitsky")
    print("=" * 75)
    data_file = create_test_data_file(binary, num_rows=args.rows)
    if not data_file:
        sys.exit(1)

    pattern_counts = [1000, 10000, 50000, 100000, 200000]
    for label, case_insensitive in [("Case-sensitive", False), ("Case-insensitive", True)]:
        print(f"\n{label}:")
        print(f"{'Patterns':<10} {'Chunks':<8} {'Aho (ms)':<12} {'Chunked (ms)':<14} {'Speedup':<10}")
        print("-" * 60)
        for pc in pattern_counts:
            chunks = (pc + VOLNITSKY_MAX_PATTERNS - 1) // VOLNITSKY_MAX_PATTERNS
            aho, chunked, errors = benchmark(binary, data_file, pc, iterations=2, case_insensitive=case_insensitive)
            aho_str = f"{aho:.1f}" if aho is not None else "ERR"
            chunked_str = f"{chunked:.1f}" if chunked is not None else "ERR"
            speedup = f"{chunked/aho:.1f}x" if aho and chunked else ""
            print(f"{pc:<10} {chunks:<8} {aho_str:<12} {chunked_str:<14} {speedup:<10}")
            for key, err in errors.items():
                print(f" {key}: {err}")

    os.remove(data_file)
    print(f"\nCleaned up: {data_file}")


if __name__ == "__main__":
    main()