Created
December 3, 2025 00:06
-
-
Save surya501/db2275bbd47a487c63afc377bda02ae4 to your computer and use it in GitHub Desktop.
Annoy vector database benchmark for DINOv2 embeddings (1024D)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # /// script | |
| # requires-python = ">=3.10" | |
| # dependencies = [ | |
| # "annoy==1.17.0", # Note: 1.17.3 has a bug with get_nns_by_vector on macOS ARM64 | |
| # "numpy", | |
| # ] | |
| # /// | |
| """ | |
| Annoy Vector Database Benchmark | |
| This script benchmarks the Annoy library for approximate nearest neighbor search. | |
| It measures storage size, indexing time, and query performance for DINOv2 embeddings. | |
| Usage: | |
| uv run benchmark_annoy.py | |
| What this script does: | |
| 1. Creates random embeddings (simulating DINOv2 output) | |
| 2. Builds an Annoy index with those embeddings | |
| 3. Saves the index to disk and measures file size | |
| 4. Runs query benchmarks to measure search speed | |
| """ | |
| import os | |
| import time | |
| import numpy as np | |
| from annoy import AnnoyIndex | |
# =============================================================================
# CONFIGURATION - Modify these values to test different scenarios
# =============================================================================
NUM_EMBEDDINGS = 300_000  # Number of vectors to index (e.g., 300k images)
EMBEDDING_DIM = 1024  # DINOv2 outputs 1024-dimensional vectors
NUM_TREES = 100  # More trees = better accuracy, larger index, slower build
METRIC = "angular"  # "angular" = cosine similarity (best for DINOv2)
INDEX_PATH = "./annoy_index.ann"  # Where to save the index file
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================
def format_size(size_bytes: int) -> str:
    """Convert bytes to human-readable format (KB, MB, GB)."""
    value = float(size_bytes)
    units = ("B", "KB", "MB", "GB")
    idx = 0
    # Walk up the unit ladder, dividing by 1024 until the value fits.
    while idx < len(units):
        if value < 1024:
            return f"{value:.1f} {units[idx]}"
        value /= 1024
        idx += 1
    # Anything past GB is reported in terabytes.
    return f"{value:.1f} TB"
| # ============================================================================= | |
| # MAIN BENCHMARK | |
| # ============================================================================= | |
def main():
    """Run the full Annoy benchmark end to end.

    Steps:
      1. Create an AnnoyIndex sized for DINOv2 embeddings.
      2. Generate random vectors and add them to the index.
      3. Build the tree structure (timed).
      4. Save the index to disk and report the file size.
      5. Benchmark query latency/throughput, including the effect of search_k.

    All timing uses time.perf_counter() — a monotonic, high-resolution clock —
    rather than time.time(), whose wall-clock value can jump (NTP adjustments)
    and skew benchmark numbers.
    """
    print("=" * 60)
    print("Annoy Vector Database Benchmark")
    print("=" * 60)
    print(f"Embeddings: {NUM_EMBEDDINGS:,}")
    print(f"Dimensions: {EMBEDDING_DIM} (DINOv2)")
    print(f"Trees: {NUM_TREES}")
    print(f"Metric: {METRIC} (cosine similarity)")
    print("=" * 60)

    # -------------------------------------------------------------------------
    # Step 1: Create the Annoy index
    # -------------------------------------------------------------------------
    # The index is initialized with:
    #   - embedding dimension (must match your vectors)
    #   - distance metric ("angular" for cosine, "euclidean" for L2)
    print("\n[Step 1] Creating Annoy index...")
    index = AnnoyIndex(EMBEDDING_DIM, METRIC)

    # -------------------------------------------------------------------------
    # Step 2: Add embeddings to the index
    # -------------------------------------------------------------------------
    # Each item needs:
    #   - A unique integer ID (0, 1, 2, ...)
    #   - A vector (list or numpy array of floats)
    #
    # In real usage, you'd load embeddings from DINOv2:
    #   embedding = dino_model.encode(image)
    #   index.add_item(image_id, embedding)
    #
    # Here we use random vectors for benchmarking.

    # Pre-generate all embeddings at once (much faster than generating one at a time)
    print(f"\n[Step 2a] Generating {NUM_EMBEDDINGS:,} random embeddings...")
    start_time = time.perf_counter()
    embeddings = np.random.randn(NUM_EMBEDDINGS, EMBEDDING_DIM).astype(np.float32)
    gen_time = time.perf_counter() - start_time
    print(f"  Done! Generation took {gen_time:.1f}s")

    # Add embeddings to the index
    print("\n[Step 2b] Adding embeddings to index...")
    start_time = time.perf_counter()
    # max(1, ...) guards against ZeroDivisionError when NUM_EMBEDDINGS < 10.
    progress_interval = max(1, NUM_EMBEDDINGS // 10)
    for i in range(NUM_EMBEDDINGS):
        # Add to index: add_item(id, vector)
        # The ID is just an integer - you maintain a separate mapping to metadata
        index.add_item(i, embeddings[i].tolist())
        # Progress update every 10%
        if (i + 1) % progress_interval == 0:
            pct = (i + 1) / NUM_EMBEDDINGS * 100
            elapsed = time.perf_counter() - start_time
            rate = (i + 1) / elapsed
            print(f"  [{pct:5.1f}%] Added {i + 1:,} embeddings ({rate:,.0f}/sec)")
    add_time = time.perf_counter() - start_time
    print(f"  Done! Adding took {add_time:.1f}s")

    # -------------------------------------------------------------------------
    # Step 3: Build the index
    # -------------------------------------------------------------------------
    # This creates the tree structure for fast approximate search.
    # More trees = better accuracy but slower build and larger file.
    # Recommended: 10-100 trees depending on your accuracy needs.
    #
    # NOTE: After building, you cannot add more items!
    print(f"\n[Step 3] Building index with {NUM_TREES} trees...")
    start_time = time.perf_counter()
    index.build(NUM_TREES)
    build_time = time.perf_counter() - start_time
    print(f"  Done! Building took {build_time:.1f}s")

    # -------------------------------------------------------------------------
    # Step 4: Save the index to disk
    # -------------------------------------------------------------------------
    # The index can be saved and loaded later.
    # Loading uses memory-mapping, so it's very fast.
    print(f"\n[Step 4] Saving index to {INDEX_PATH}...")
    index.save(INDEX_PATH)
    file_size = os.path.getsize(INDEX_PATH)
    print(f"  Done! File size: {format_size(file_size)}")

    # -------------------------------------------------------------------------
    # Step 5: Query benchmark
    # -------------------------------------------------------------------------
    # get_nns_by_vector(vector, n) returns the n nearest neighbor IDs
    #
    # Options:
    #   - include_distances=True: also return distances
    #   - search_k=-1: how many nodes to inspect (higher = more accurate, slower)
    #
    # NOTE: This returns only INTEGER IDs, not the actual data!
    # You need a separate dict/database to map IDs to metadata.
    print("\n[Step 5] Query benchmark (100 queries, top-10 results)...")
    query_times = []
    for _ in range(100):
        # Create a random query vector (in real usage, this is the DINOv2 embedding of your query image)
        query_vector = np.random.randn(EMBEDDING_DIM).astype(np.float32).tolist()
        # Time the search
        start_time = time.perf_counter()
        # Find 10 nearest neighbors
        # Returns: [id1, id2, id3, ...] (just the integer IDs!)
        nearest_ids = index.get_nns_by_vector(query_vector, 10)
        query_times.append((time.perf_counter() - start_time) * 1000)  # Convert to ms
    avg_query_ms = sum(query_times) / len(query_times)
    qps = 1000 / avg_query_ms  # Queries per second

    # -------------------------------------------------------------------------
    # Example: Getting results with distances
    # -------------------------------------------------------------------------
    print("\n[Example] Query with distances:")
    query_vector = np.random.randn(EMBEDDING_DIM).astype(np.float32).tolist()
    ids, distances = index.get_nns_by_vector(query_vector, 5, include_distances=True)
    print(f"  Top 5 nearest IDs: {ids}")
    print(f"  Distances: {[f'{d:.4f}' for d in distances]}")

    # -------------------------------------------------------------------------
    # Understanding search_k
    # -------------------------------------------------------------------------
    # search_k controls how many tree nodes to inspect during search.
    #
    # How Annoy works:
    #   - Each tree splits the vector space into regions using random hyperplanes
    #   - At query time, Annoy traverses each tree to find candidate neighbors
    #   - search_k = total number of candidates to inspect across all trees
    #
    # Default: search_k = n_trees * n_results (e.g., 30 trees * 10 results = 300)
    #
    # Trade-off:
    #   - Higher search_k = more candidates checked = better accuracy, slower
    #   - Lower search_k = fewer candidates = faster, may miss true neighbors
    #
    # You can tune this at query time without rebuilding the index!
    print("\n[Example] Effect of search_k on query time:")
    query_vector = np.random.randn(EMBEDDING_DIM).astype(np.float32).tolist()
    n_results = 10
    for search_k in [-1, 100, 1000, 10000]:
        times = []
        for _ in range(50):
            start = time.perf_counter()
            ids = index.get_nns_by_vector(query_vector, n_results, search_k=search_k)
            times.append((time.perf_counter() - start) * 1000)
        avg_ms = sum(times) / len(times)
        label = "default" if search_k == -1 else str(search_k)
        print(f"  search_k={label:>7}: {avg_ms:.3f}ms")

    # -------------------------------------------------------------------------
    # Summary
    # -------------------------------------------------------------------------
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)
    print(f"Embeddings: {NUM_EMBEDDINGS:,}")
    print(f"Dimensions: {EMBEDDING_DIM}")
    print(f"Trees: {NUM_TREES}")
    print(f"File size: {format_size(file_size)} ({file_size / NUM_EMBEDDINGS:.1f} bytes/embedding)")
    print(f"Index time: {add_time + build_time:.1f}s (add: {add_time:.1f}s, build: {build_time:.1f}s)")
    print(f"Avg query time: {avg_query_ms:.3f}ms")
    print(f"Throughput: {qps:,.0f} queries/sec")
    print("=" * 60)

    # -------------------------------------------------------------------------
    # How to load the index later
    # -------------------------------------------------------------------------
    print("\n[Tip] To load the index later:")
    print(f"""
    from annoy import AnnoyIndex
    index = AnnoyIndex({EMBEDDING_DIM}, '{METRIC}')
    index.load('{INDEX_PATH}')  # Memory-mapped, very fast!
    # Query
    query_embedding = dino_model.encode(my_image)
    nearest_ids = index.get_nns_by_vector(query_embedding, 10)
    # Look up your metadata
    results = [my_metadata[id] for id in nearest_ids]
    """)
# Standard script entry point: run the benchmark only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment