Skip to content

Instantly share code, notes, and snippets.

@surya501
Created December 3, 2025 00:06
Show Gist options

  • Save surya501/db2275bbd47a487c63afc377bda02ae4 to your computer and use it in GitHub Desktop.
Annoy vector database benchmark for DINOv2 embeddings (1024D)
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "annoy==1.17.0", # Note: 1.17.3 has a bug with get_nns_by_vector on macOS ARM64
# "numpy",
# ]
# ///
"""
Annoy Vector Database Benchmark
This script benchmarks the Annoy library for approximate nearest neighbor search.
It measures storage size, indexing time, and query performance for DINOv2 embeddings.
Usage:
uv run benchmark_annoy.py
What this script does:
1. Creates random embeddings (simulating DINOv2 output)
2. Builds an Annoy index with those embeddings
3. Saves the index to disk and measures file size
4. Runs query benchmarks to measure search speed
"""
import os
import time
import numpy as np
from annoy import AnnoyIndex
# =============================================================================
# CONFIGURATION - Modify these values to test different scenarios
# =============================================================================
NUM_EMBEDDINGS = 300_000 # Number of vectors to index (e.g., 300k images)
EMBEDDING_DIM = 1024 # DINOv2 outputs 1024-dimensional vectors
NUM_TREES = 100 # More trees = better accuracy, larger index, slower build
METRIC = "angular" # "angular" = cosine similarity (best for DINOv2); "euclidean" = L2
INDEX_PATH = "./annoy_index.ann" # Where to save the index file (overwritten on each run)
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================
def format_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string (B, KB, MB, GB, TB)."""
    units = ("B", "KB", "MB", "GB", "TB")
    value = float(size_bytes)
    unit_idx = 0
    # Divide down by 1024 until the value fits, capping at the largest unit.
    while value >= 1024 and unit_idx < len(units) - 1:
        value /= 1024
        unit_idx += 1
    return f"{value:.1f} {units[unit_idx]}"
# =============================================================================
# MAIN BENCHMARK
# =============================================================================
def _print_header() -> None:
    """Print the run-configuration banner."""
    print("=" * 60)
    print("Annoy Vector Database Benchmark")
    print("=" * 60)
    print(f"Embeddings: {NUM_EMBEDDINGS:,}")
    print(f"Dimensions: {EMBEDDING_DIM} (DINOv2)")
    print(f"Trees: {NUM_TREES}")
    print(f"Metric: {METRIC} (cosine similarity)")
    print("=" * 60)


def _add_embeddings(index, embeddings) -> float:
    """Add every row of *embeddings* to *index* with progress output.

    Each item gets a unique integer ID (its row number); callers keep a
    separate mapping from IDs to metadata. Returns elapsed seconds.
    """
    n = len(embeddings)
    # max(1, ...) guards against ZeroDivisionError when n < 10.
    progress_interval = max(1, n // 10)
    start = time.perf_counter()
    for i in range(n):
        # In real usage the vector would be dino_model.encode(image).
        index.add_item(i, embeddings[i].tolist())
        if (i + 1) % progress_interval == 0:
            pct = (i + 1) / n * 100
            elapsed = time.perf_counter() - start
            rate = (i + 1) / elapsed
            print(f" [{pct:5.1f}%] Added {i + 1:,} embeddings ({rate:,.0f}/sec)")
    return time.perf_counter() - start


def _benchmark_queries(index, n_queries: int = 100, top_k: int = 10):
    """Time *n_queries* random top-*top_k* searches.

    Returns (average query time in ms, queries per second). Note that
    get_nns_by_vector returns only integer IDs, not the actual data.
    """
    query_times = []
    for _ in range(n_queries):
        # In real usage this is the DINOv2 embedding of the query image.
        query_vector = np.random.randn(EMBEDDING_DIM).astype(np.float32).tolist()
        start = time.perf_counter()
        index.get_nns_by_vector(query_vector, top_k)
        query_times.append((time.perf_counter() - start) * 1000)  # ms
    avg_query_ms = sum(query_times) / len(query_times)
    return avg_query_ms, 1000 / avg_query_ms


def _example_with_distances(index) -> None:
    """Demonstrate include_distances=True, which also returns distances."""
    print("\n[Example] Query with distances:")
    query_vector = np.random.randn(EMBEDDING_DIM).astype(np.float32).tolist()
    ids, distances = index.get_nns_by_vector(query_vector, 5, include_distances=True)
    print(f" Top 5 nearest IDs: {ids}")
    print(f" Distances: {[f'{d:.4f}' for d in distances]}")


def _search_k_sweep(index) -> None:
    """Show how search_k affects query latency.

    search_k is the total number of candidate nodes inspected across all
    trees (default -1 means roughly n_trees * n_results). Higher values
    check more candidates (better accuracy, slower); it can be tuned at
    query time without rebuilding the index.
    """
    print("\n[Example] Effect of search_k on query time:")
    query_vector = np.random.randn(EMBEDDING_DIM).astype(np.float32).tolist()
    n_results = 10
    for search_k in (-1, 100, 1000, 10000):
        samples = []
        for _ in range(50):
            start = time.perf_counter()
            index.get_nns_by_vector(query_vector, n_results, search_k=search_k)
            samples.append((time.perf_counter() - start) * 1000)
        avg_ms = sum(samples) / len(samples)
        label = "default" if search_k == -1 else str(search_k)
        print(f" search_k={label:>7}: {avg_ms:.3f}ms")


def _print_summary(file_size, add_time, build_time, avg_query_ms, qps) -> None:
    """Print the final results table."""
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)
    print(f"Embeddings: {NUM_EMBEDDINGS:,}")
    print(f"Dimensions: {EMBEDDING_DIM}")
    print(f"Trees: {NUM_TREES}")
    print(f"File size: {format_size(file_size)} ({file_size / NUM_EMBEDDINGS:.1f} bytes/embedding)")
    print(f"Index time: {add_time + build_time:.1f}s (add: {add_time:.1f}s, build: {build_time:.1f}s)")
    print(f"Avg query time: {avg_query_ms:.3f}ms")
    print(f"Throughput: {qps:,.0f} queries/sec")
    print("=" * 60)


def _print_load_tip() -> None:
    """Print a snippet showing how to memory-map the saved index later."""
    print("\n[Tip] To load the index later:")
    print(f"""
from annoy import AnnoyIndex
index = AnnoyIndex({EMBEDDING_DIM}, '{METRIC}')
index.load('{INDEX_PATH}') # Memory-mapped, very fast!
# Query
query_embedding = dino_model.encode(my_image)
nearest_ids = index.get_nns_by_vector(query_embedding, 10)
# Look up your metadata
results = [my_metadata[id] for id in nearest_ids]
""")


def main():
    """Run the full Annoy benchmark: generate data, index, save, and query.

    Uses the module-level configuration constants (NUM_EMBEDDINGS,
    EMBEDDING_DIM, NUM_TREES, METRIC, INDEX_PATH). Prints all results to
    stdout and leaves the built index on disk at INDEX_PATH.
    All timing uses time.perf_counter(), the monotonic clock intended for
    measuring intervals (time.time() can jump with wall-clock adjustments).
    """
    _print_header()

    # Step 1: the index is initialized with the embedding dimension (must
    # match the vectors) and a distance metric ("angular" for cosine,
    # "euclidean" for L2).
    print("\n[Step 1] Creating Annoy index...")
    index = AnnoyIndex(EMBEDDING_DIM, METRIC)

    # Step 2a: pre-generate all embeddings at once (much faster than one at
    # a time). Random vectors stand in for real DINOv2 output here.
    print(f"\n[Step 2a] Generating {NUM_EMBEDDINGS:,} random embeddings...")
    start = time.perf_counter()
    embeddings = np.random.randn(NUM_EMBEDDINGS, EMBEDDING_DIM).astype(np.float32)
    gen_time = time.perf_counter() - start
    print(f" Done! Generation took {gen_time:.1f}s")

    # Step 2b: add the embeddings to the index, one item per integer ID.
    print("\n[Step 2b] Adding embeddings to index...")
    add_time = _add_embeddings(index, embeddings)
    print(f" Done! Adding took {add_time:.1f}s")

    # Step 3: build the tree structure for fast approximate search.
    # More trees = better accuracy but slower build and larger file.
    # NOTE: after building, no more items can be added!
    print(f"\n[Step 3] Building index with {NUM_TREES} trees...")
    start = time.perf_counter()
    index.build(NUM_TREES)
    build_time = time.perf_counter() - start
    print(f" Done! Building took {build_time:.1f}s")

    # Step 4: persist to disk; loading later is memory-mapped and fast.
    print(f"\n[Step 4] Saving index to {INDEX_PATH}...")
    index.save(INDEX_PATH)
    file_size = os.path.getsize(INDEX_PATH)
    print(f" Done! File size: {format_size(file_size)}")

    # Step 5: measure query latency and throughput.
    print("\n[Step 5] Query benchmark (100 queries, top-10 results)...")
    avg_query_ms, qps = _benchmark_queries(index, n_queries=100, top_k=10)

    _example_with_distances(index)
    _search_k_sweep(index)
    _print_summary(file_size, add_time, build_time, avg_query_ms, qps)
    _print_load_tip()
# Entry point guard: run the benchmark only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment