Skip to content

Instantly share code, notes, and snippets.

@surya501
Created December 3, 2025 00:06
Show Gist options

  • Save surya501/db2275bbd47a487c63afc377bda02ae4 to your computer and use it in GitHub Desktop.
Annoy vector database benchmark for DINOv2 embeddings (1024D)
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "annoy==1.17.0", # Note: 1.17.3 has a bug with get_nns_by_vector on macOS ARM64
# "numpy",
# ]
# ///
"""
Annoy Vector Database Benchmark
This script benchmarks the Annoy library for approximate nearest neighbor search.
It measures storage size, indexing time, and query performance for DINOv2 embeddings.
Usage:
uv run benchmark_annoy.py
What this script does:
1. Creates random embeddings (simulating DINOv2 output)
2. Builds an Annoy index with those embeddings
3. Saves the index to disk and measures file size
4. Runs query benchmarks to measure search speed
"""
import os
import time
import numpy as np
from annoy import AnnoyIndex
# =============================================================================
# CONFIGURATION - Modify these values to test different scenarios
# =============================================================================
NUM_EMBEDDINGS = 300_000 # Number of vectors to index (e.g., 300k images)
EMBEDDING_DIM = 1024 # DINOv2 outputs 1024-dimensional vectors
NUM_TREES = 100 # More trees = better accuracy, larger index, slower build
METRIC = "angular" # "angular" = cosine similarity (best for DINOv2); "euclidean" = L2
INDEX_PATH = "./annoy_index.ann" # Where to save the index file (overwritten on each run)
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================
def format_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string (B, KB, MB, GB, TB)."""
    units = ("B", "KB", "MB", "GB", "TB")
    value = float(size_bytes)
    unit_idx = 0
    # Divide down by 1024 until the value fits, capping at the largest unit.
    while value >= 1024 and unit_idx < len(units) - 1:
        value /= 1024
        unit_idx += 1
    return f"{value:.1f} {units[unit_idx]}"
# =============================================================================
# MAIN BENCHMARK
# =============================================================================
def _print_header() -> None:
    """Print the run-configuration banner."""
    print("=" * 60)
    print("Annoy Vector Database Benchmark")
    print("=" * 60)
    print(f"Embeddings: {NUM_EMBEDDINGS:,}")
    print(f"Dimensions: {EMBEDDING_DIM} (DINOv2)")
    print(f"Trees: {NUM_TREES}")
    print(f"Metric: {METRIC} (cosine similarity)")
    print("=" * 60)


def _add_embeddings(index, embeddings) -> float:
    """Add every row of *embeddings* to *index* with progress output.

    Each item gets a unique integer ID (its row number); callers keep a
    separate mapping from IDs to metadata. Returns elapsed seconds.
    """
    n = len(embeddings)
    # max(1, ...) guards against ZeroDivisionError when n < 10.
    progress_interval = max(1, n // 10)
    start = time.perf_counter()
    for i in range(n):
        # In real usage the vector would be dino_model.encode(image).
        index.add_item(i, embeddings[i].tolist())
        if (i + 1) % progress_interval == 0:
            pct = (i + 1) / n * 100
            elapsed = time.perf_counter() - start
            rate = (i + 1) / elapsed
            print(f" [{pct:5.1f}%] Added {i + 1:,} embeddings ({rate:,.0f}/sec)")
    return time.perf_counter() - start


def _benchmark_queries(index, n_queries: int = 100, top_k: int = 10):
    """Time *n_queries* random top-*top_k* searches.

    Returns (average query time in ms, queries per second). Note that
    get_nns_by_vector returns only integer IDs, not the actual data.
    """
    query_times = []
    for _ in range(n_queries):
        # In real usage this is the DINOv2 embedding of the query image.
        query_vector = np.random.randn(EMBEDDING_DIM).astype(np.float32).tolist()
        start = time.perf_counter()
        index.get_nns_by_vector(query_vector, top_k)
        query_times.append((time.perf_counter() - start) * 1000)  # ms
    avg_query_ms = sum(query_times) / len(query_times)
    return avg_query_ms, 1000 / avg_query_ms


def _example_with_distances(index) -> None:
    """Demonstrate include_distances=True, which also returns distances."""
    print("\n[Example] Query with distances:")
    query_vector = np.random.randn(EMBEDDING_DIM).astype(np.float32).tolist()
    ids, distances = index.get_nns_by_vector(query_vector, 5, include_distances=True)
    print(f" Top 5 nearest IDs: {ids}")
    print(f" Distances: {[f'{d:.4f}' for d in distances]}")


def _search_k_sweep(index) -> None:
    """Show how search_k affects query latency.

    search_k is the total number of candidate nodes inspected across all
    trees (default -1 means roughly n_trees * n_results). Higher values
    check more candidates (better accuracy, slower); it can be tuned at
    query time without rebuilding the index.
    """
    print("\n[Example] Effect of search_k on query time:")
    query_vector = np.random.randn(EMBEDDING_DIM).astype(np.float32).tolist()
    n_results = 10
    for search_k in (-1, 100, 1000, 10000):
        samples = []
        for _ in range(50):
            start = time.perf_counter()
            index.get_nns_by_vector(query_vector, n_results, search_k=search_k)
            samples.append((time.perf_counter() - start) * 1000)
        avg_ms = sum(samples) / len(samples)
        label = "default" if search_k == -1 else str(search_k)
        print(f" search_k={label:>7}: {avg_ms:.3f}ms")


def _print_summary(file_size, add_time, build_time, avg_query_ms, qps) -> None:
    """Print the final results table."""
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)
    print(f"Embeddings: {NUM_EMBEDDINGS:,}")
    print(f"Dimensions: {EMBEDDING_DIM}")
    print(f"Trees: {NUM_TREES}")
    print(f"File size: {format_size(file_size)} ({file_size / NUM_EMBEDDINGS:.1f} bytes/embedding)")
    print(f"Index time: {add_time + build_time:.1f}s (add: {add_time:.1f}s, build: {build_time:.1f}s)")
    print(f"Avg query time: {avg_query_ms:.3f}ms")
    print(f"Throughput: {qps:,.0f} queries/sec")
    print("=" * 60)


def _print_load_tip() -> None:
    """Print a snippet showing how to memory-map the saved index later."""
    print("\n[Tip] To load the index later:")
    print(f"""
from annoy import AnnoyIndex
index = AnnoyIndex({EMBEDDING_DIM}, '{METRIC}')
index.load('{INDEX_PATH}') # Memory-mapped, very fast!
# Query
query_embedding = dino_model.encode(my_image)
nearest_ids = index.get_nns_by_vector(query_embedding, 10)
# Look up your metadata
results = [my_metadata[id] for id in nearest_ids]
""")


def main():
    """Run the full Annoy benchmark: generate data, index, save, and query.

    Uses the module-level configuration constants (NUM_EMBEDDINGS,
    EMBEDDING_DIM, NUM_TREES, METRIC, INDEX_PATH). Prints all results to
    stdout and leaves the built index on disk at INDEX_PATH.
    All timing uses time.perf_counter(), the monotonic clock intended for
    measuring intervals (time.time() can jump with wall-clock adjustments).
    """
    _print_header()

    # Step 1: the index is initialized with the embedding dimension (must
    # match the vectors) and a distance metric ("angular" for cosine,
    # "euclidean" for L2).
    print("\n[Step 1] Creating Annoy index...")
    index = AnnoyIndex(EMBEDDING_DIM, METRIC)

    # Step 2a: pre-generate all embeddings at once (much faster than one at
    # a time). Random vectors stand in for real DINOv2 output here.
    print(f"\n[Step 2a] Generating {NUM_EMBEDDINGS:,} random embeddings...")
    start = time.perf_counter()
    embeddings = np.random.randn(NUM_EMBEDDINGS, EMBEDDING_DIM).astype(np.float32)
    gen_time = time.perf_counter() - start
    print(f" Done! Generation took {gen_time:.1f}s")

    # Step 2b: add the embeddings to the index, one item per integer ID.
    print("\n[Step 2b] Adding embeddings to index...")
    add_time = _add_embeddings(index, embeddings)
    print(f" Done! Adding took {add_time:.1f}s")

    # Step 3: build the tree structure for fast approximate search.
    # More trees = better accuracy but slower build and larger file.
    # NOTE: after building, no more items can be added!
    print(f"\n[Step 3] Building index with {NUM_TREES} trees...")
    start = time.perf_counter()
    index.build(NUM_TREES)
    build_time = time.perf_counter() - start
    print(f" Done! Building took {build_time:.1f}s")

    # Step 4: persist to disk; loading later is memory-mapped and fast.
    print(f"\n[Step 4] Saving index to {INDEX_PATH}...")
    index.save(INDEX_PATH)
    file_size = os.path.getsize(INDEX_PATH)
    print(f" Done! File size: {format_size(file_size)}")

    # Step 5: measure query latency and throughput.
    print("\n[Step 5] Query benchmark (100 queries, top-10 results)...")
    avg_query_ms, qps = _benchmark_queries(index, n_queries=100, top_k=10)

    _example_with_distances(index)
    _search_k_sweep(index)
    _print_summary(file_size, add_time, build_time, avg_query_ms, qps)
    _print_load_tip()
# Entry point guard: run the benchmark only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment