Benchmark script for the new multiSearchAny implementation in ClickHouse
#!/usr/bin/env python3
"""
Micro-benchmark: Aho-Corasick (daachorse) vs chunked Volnitsky for large pattern counts.
Generates test data once to a file, then benchmarks queries against that file.

Usage:
    python multi_search_bench.py [--rows N]
"""
import subprocess
import sys
import re
import statistics
import tempfile
import os
import argparse
from pathlib import Path
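# ClickHouse's Volnitsky-based multiSearch* functions accept at most 255 needles
# per call (a UInt8 limit), hence the chunk size used by the baseline query below.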
VOLNITSKY_MAX_PATTERNS = 255

# Use splitByString instead of array literals to avoid AST size explosion.
# Array literals like ['p1', 'p2', ...] create one AST node per element.
# splitByString('$str$', 'p1$str$p2$str$...') keeps the AST small and constructs
# the array at runtime. The $str$ delimiter allows patterns to contain commas.
PATTERN_DELIMITER = "$str$"
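# Prefer a locally built ClickHouse binary when present; otherwise fall back to
# whatever `clickhouse` resolves to on PATH.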
def get_clickhouse_binary():
    macbuild_path = Path("/Users/audio/dev/ClickHouse/macbuild/programs/clickhouse")
    if macbuild_path.exists():
        return str(macbuild_path)
    return "clickhouse"

def create_test_data_file(binary: str, num_rows: int) -> str:
    fd, filepath = tempfile.mkstemp(suffix=".parquet", prefix="test_data_")
    os.close(fd)
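    # mkstemp reserves a unique name; remove the empty file because
    # INTO OUTFILE refuses to write to a path that already exists.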
    os.remove(filepath)
    query = f"""
        SELECT * FROM generateRandom('data String')
        LIMIT {num_rows}
        INTO OUTFILE '{filepath}'
        FORMAT Parquet
    """
    result = subprocess.run(
        [binary, "local", "--query", query],
        capture_output=True, text=True, timeout=120
    )
    if result.returncode != 0 or not os.path.exists(filepath):
        print(f"Error generating test data: {result.stderr}", file=sys.stderr)
        return None
    file_size = os.path.getsize(filepath)
    print(f"Generated: {filepath} ({file_size:,} bytes, {num_rows:,} rows)")
    return filepath
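# Needles are generated as "nomatch_<i>" so they (almost surely) never occur in
# the random data: timings then reflect full scans rather than early-exit hits.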
def build_pattern_string(pattern_count: int) -> str:
    return PATTERN_DELIMITER.join([f"nomatch_{i}" for i in range(pattern_count)])
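# Baseline query: emulate the 255-needle cap by ORing together one
# multiSearchAny call per chunk of at most VOLNITSKY_MAX_PATTERNS needles.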
def build_chunked_query(data_file: str, pattern_count: int, case_insensitive: bool) -> str:
    func = "multiSearchAnyCaseInsensitive" if case_insensitive else "multiSearchAny"
    chunks = []
    for start in range(0, pattern_count, VOLNITSKY_MAX_PATTERNS):
        end = min(start + VOLNITSKY_MAX_PATTERNS, pattern_count)
        pattern_str = PATTERN_DELIMITER.join([f"nomatch_{i}" for i in range(start, end)])
        chunks.append(f"{func}(data, splitByString('{PATTERN_DELIMITER}', '{pattern_str}'))")
    return f"SELECT * FROM file('{data_file}', Parquet) WHERE {' OR '.join(chunks)}"
def run_query(binary: str, query: str, timeout: int = 600):
    fd, query_file = tempfile.mkstemp(suffix=".sql", prefix="query_")
    os.write(fd, f"{query} FORMAT Null".encode('utf-8'))
    os.close(fd)
    try:
        result = subprocess.run(
            [binary, "local", "--time", "--max_query_size=10000000000", "--queries-file", query_file],
            capture_output=True, text=True, timeout=timeout
        )
        if result.returncode == 0:
            # --time prints the elapsed seconds to stderr; convert to ms.
            match = re.search(r'^(\d+\.\d+)', result.stderr.strip())
            if match:
                return float(match.group(1)) * 1000, None
        return None, result.stderr.strip()[:200] if result.stderr else f"Exit code {result.returncode}"
    except subprocess.TimeoutExpired:
        return None, "TIMEOUT"
    except Exception as e:
        return None, str(e)[:200]
    finally:
        # Remove the query file on all paths, including timeouts, which
        # previously leaked it.
        os.remove(query_file)
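# Benchmarks one pattern count both ways: a single call with all needles (the
# new Aho-Corasick path) vs the chunked OR query (the 255-needle Volnitsky
# path). Returns median times in ms plus any errors seen.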
def benchmark(binary: str, data_file: str, pattern_count: int, iterations: int, case_insensitive: bool):
    func = "multiSearchAnyCaseInsensitive" if case_insensitive else "multiSearchAny"
    pattern_str = build_pattern_string(pattern_count)
    query_single = f"SELECT * FROM file('{data_file}', Parquet) WHERE {func}(data, splitByString('{PATTERN_DELIMITER}', '{pattern_str}'))"
    query_chunked = build_chunked_query(data_file, pattern_count, case_insensitive)
    # Warm-up
    run_query(binary, query_single)
    run_query(binary, query_chunked)
    aho_times, chunked_times, errors = [], [], {}
    for _ in range(iterations):
        t, err = run_query(binary, query_single)
        if t is not None:
            aho_times.append(t)
        elif err and "aho" not in errors:
            errors["aho"] = err
    for _ in range(iterations):
        t, err = run_query(binary, query_chunked)
        if t is not None:
            chunked_times.append(t)
        elif err and "chunked" not in errors:
            errors["chunked"] = err
    return (
        statistics.median(aho_times) if aho_times else None,
        statistics.median(chunked_times) if chunked_times else None,
        errors
    )
def main():
    parser = argparse.ArgumentParser(description="Aho-Corasick vs Volnitsky micro-benchmark")
    parser.add_argument("--rows", "-r", type=int, default=100000)
    args = parser.parse_args()
    binary = get_clickhouse_binary()
    print("Aho-Corasick (daachorse) vs Chunked Volnitsky")
    print("=" * 75)
    data_file = create_test_data_file(binary, num_rows=args.rows)
    if not data_file:
        sys.exit(1)
    pattern_counts = [1000, 10000, 50000, 100000, 200000]
    for label, case_insensitive in [("Case-sensitive", False), ("Case-insensitive", True)]:
        print(f"\n{label}:")
        print(f"{'Patterns':<10} {'Chunks':<8} {'Aho (ms)':<12} {'Chunked (ms)':<14} {'Speedup':<10}")
        print("-" * 60)
        for pc in pattern_counts:
            chunks = (pc + VOLNITSKY_MAX_PATTERNS - 1) // VOLNITSKY_MAX_PATTERNS
            aho, chunked, errors = benchmark(binary, data_file, pc, iterations=2, case_insensitive=case_insensitive)
            aho_str = f"{aho:.1f}" if aho is not None else "ERR"
            chunked_str = f"{chunked:.1f}" if chunked is not None else "ERR"
            speedup = f"{chunked/aho:.1f}x" if aho and chunked else ""
            print(f"{pc:<10} {chunks:<8} {aho_str:<12} {chunked_str:<14} {speedup:<10}")
            for key, err in errors.items():
                print(f"  {key}: {err}")
    os.remove(data_file)
    print(f"\nCleaned up: {data_file}")

if __name__ == "__main__":
    main()