Skip to content

Instantly share code, notes, and snippets.

@kkirsche
Last active September 5, 2025 14:26
Show Gist options
  • Select an option

  • Save kkirsche/924f7b3c294461fcb08f93c2dac32c44 to your computer and use it in GitHub Desktop.

Select an option

Save kkirsche/924f7b3c294461fcb08f93c2dac32c44 to your computer and use it in GitHub Desktop.
Semantic Search Example
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "sentence-transformers",
# "sqlite-vec",
# "numpy",
# ]
# ///
import sqlite3
import sentence_transformers
import sqlite_vec
# Model identifier handed to sentence_transformers.SentenceTransformer() in
# main(); the model's embedding dimension sizes the documents_vec table.
MODEL_NAME = "baai/bge-large-en-v1.5"
# Sample corpus for the demo. ingest_data() embeds every entry and stores it
# under id = (list position + 1); semantic_search() ranks these texts against
# a query embedding.
DOCUMENTS = [
    "A guide to training your new puppy.",
    "The best recipes for homemade pasta.",
    "Exploring the Andes: a journey through South America.",
    "Recent breakthroughs in quantum computing.",
    "How to keep your house cat happy and healthy.",
    "The fundamentals of machine learning.",
    "A brief history of the Roman Empire.",
    "Tips for hiking in the mountains.",
]
def setup_database(conn: sqlite3.Connection, vector_dim: int) -> None:
    """Load sqlite-vec on ``conn`` and create the demo schema.

    Three objects are created:

    * ``documents`` -- ``id``, ``text`` and the raw ``embedding`` blob;
    * ``documents_vec`` -- a ``vec0`` virtual table with one
      ``FLOAT[vector_dim]`` embedding column;
    * ``documents_insert`` -- an AFTER INSERT trigger that copies each new
      ``documents`` row into ``documents_vec`` under the same rowid.

    Only inserts are mirrored: no trigger is created for updates or deletes,
    so those are not propagated to ``documents_vec``. The CREATE statements
    carry no ``IF NOT EXISTS``, so the function is meant to run once per
    database.

    Args:
        conn: Open SQLite connection to set up.
        vector_dim: Number of float components in every embedding.

    Raises:
        TypeError: If ``vector_dim`` is not an integer (``bool`` included).
        ValueError: If ``vector_dim`` is zero or negative.
    """
    # Validate before touching the connection so a bad dimension cannot leave
    # a half-built schema behind. The value is interpolated into the DDL
    # below, so it must be a genuine int; bool is rejected explicitly because
    # it is a subclass of int.
    if isinstance(vector_dim, bool) or not isinstance(vector_dim, int):
        raise TypeError("Vector dimensions were not an integer")
    if vector_dim <= 0:
        raise ValueError("Vector dimensions must be a positive integer")

    print("Setting up the database schema")
    conn.enable_load_extension(True)
    try:
        conn.load_extension(sqlite_vec.loadable_path())
    finally:
        # Switch extension loading back off even when the load itself fails.
        conn.enable_load_extension(False)
    print("-> sqlite-vec extension loaded.")

    conn.execute("""
        CREATE TABLE documents (
            id INTEGER PRIMARY KEY,
            text TEXT NOT NULL,
            embedding BLOB NOT NULL
        )
    """)
    print("-> 'documents' table created.")

    conn.execute(f"""
        CREATE VIRTUAL TABLE documents_vec USING vec0 (
            embedding FLOAT[{vector_dim}]
        )
    """)
    print("-> 'documents_vec' virtual table created.")

    # Keep the vector table in step with the source table on insert.
    conn.execute("""
        CREATE TRIGGER documents_insert AFTER INSERT ON documents
        BEGIN
            INSERT INTO documents_vec (rowid, embedding) VALUES (new.id, new.embedding);
        END;
    """)
    print("-> sync triggers created")
def ingest_data(
    conn: sqlite3.Connection, model: sentence_transformers.SentenceTransformer
) -> None:
    """Embed every entry of ``DOCUMENTS`` and insert it into ``documents``.

    All texts are encoded in one ``model.encode`` call with
    ``normalize_embeddings=True``; each vector is stored as float32 bytes.
    Row ids are the 1-based position of the text in ``DOCUMENTS``. The inserts
    run inside a single transaction: it is committed when every row succeeds
    and rolled back if any insert raises.

    Args:
        conn: Connection on which the ``documents`` table already exists.
        model: Encoder used to produce the document embeddings.
    """
    print("\nembedding and ingesting documents...")
    embeddings = model.encode(
        DOCUMENTS,
        convert_to_numpy=True,
        normalize_embeddings=True,
        # The prompt is prepended verbatim to each text, so it needs its own
        # trailing separator (as the query prompt in semantic_search has);
        # without it the instruction fuses with the document's first word.
        prompt="Represent this text for semantic search retrieval: ",
    )
    # `with conn` commits on success and rolls back on error, so a failure
    # part-way through never leaves a partially ingested corpus behind.
    with conn:
        for doc_id, (doc, emb) in enumerate(
            zip(DOCUMENTS, embeddings, strict=True), start=1
        ):
            conn.execute(
                "INSERT INTO documents (id, text, embedding) VALUES (?, ?, ?)",
                (doc_id, doc, emb.astype("float32").tobytes()),
            )
            # Only mark the preview as truncated when it actually was.
            preview = doc if len(doc) <= 30 else f"{doc[:30]}..."
            print(f" -> ingested: '{preview}'")
def semantic_search(
    conn: sqlite3.Connection,
    model: sentence_transformers.SentenceTransformer,
    query: str,
    top_k: int = 3,
) -> None:
    """Print the stored documents most similar to ``query``.

    The query is encoded with ``model`` (normalized, float32) and compared
    against every row of ``documents_vec`` using ``vec_distance_cosine``;
    similarity is reported as ``1 - distance``. Results are printed, best
    first, and nothing is returned.

    Args:
        conn: Connection holding the ``documents`` / ``documents_vec`` tables.
        model: Encoder used to embed the query text.
        query: Free-text search string.
        top_k: Value bound to the SQL ``LIMIT`` clause.
    """
    print(f"\n--- searching for: '{query}' ---")

    query_vector = model.encode(
        query,
        convert_to_numpy=True,
        normalize_embeddings=True,
        prompt="Represent this text for searching relevant semantic search results: ",
    )
    # sqlite-vec receives the vector as a raw float32 blob.
    query_blob = query_vector.astype("float32").tobytes()

    ranking_sql = """
        SELECT
            d.text,
            1 - vec_distance_cosine(dv.embedding, ?) AS similarity
        FROM documents_vec dv
        JOIN documents d on d.id = dv.rowid
        ORDER BY similarity DESC
        LIMIT ?
        """
    matches = conn.execute(ranking_sql, (query_blob, top_k)).fetchall()

    if matches:
        print(f"top {len(matches)} results:")
        for rank, (text, similarity) in enumerate(matches, start=1):
            print(f" {rank}. '{text}' (similarity: {similarity:.4f})")
    else:
        print("no results found.")
def main() -> None:
    """Run the demo end to end against an in-memory database.

    Loads the model named by ``MODEL_NAME``, builds the schema sized to the
    model's embedding dimension, ingests ``DOCUMENTS`` and runs three sample
    searches. The connection is closed whether or not a step fails.

    Raises:
        ValueError: If the model does not report an embedding dimension.
    """
    encoder = sentence_transformers.SentenceTransformer(MODEL_NAME)
    dimension = encoder.get_sentence_embedding_dimension()
    if dimension is None:
        raise ValueError("model is expected to have vector dimension")

    sample_queries = (
        "information about domestic pets",
        "planning a trip to South America",
        "advanced computer science topics",
    )

    connection = sqlite3.connect(":memory:")
    try:
        setup_database(connection, dimension)
        ingest_data(connection, encoder)
        for question in sample_queries:
            semantic_search(connection, encoder, question)
    finally:
        connection.close()
if __name__ == "__main__":
    # Force the "spawn" multiprocessing start method, for compatibility with
    # sentence-transformers, before main() loads the model. force=True
    # overrides any start method that has already been set.
    # NOTE(review): the earlier comment said this is particularly important
    # on macOS and Windows, but "spawn" is already the default there; forcing
    # it mainly changes behaviour on platforms whose default is different
    # (e.g. "fork" on Linux) -- confirm for the targeted Python version.
    import multiprocessing
    multiprocessing.set_start_method("spawn", force=True)
    main()
❯ uv run --script main.py
Setting up the database schema
-> sqlite-vec extension loaded.
-> 'documents' table created.
-> 'documents_vec' virtual table created.
-> sync triggers created
embedding and ingesting documents...
-> ingested: 'A guide to training your new p...'
-> ingested: 'The best recipes for homemade ...'
-> ingested: 'Exploring the Andes: a journey...'
-> ingested: 'Recent breakthroughs in quantu...'
-> ingested: 'How to keep your house cat hap...'
-> ingested: 'The fundamentals of machine le...'
-> ingested: 'A brief history of the Roman E...'
-> ingested: 'Tips for hiking in the mountai...'
--- searching for: 'information about domestic pets' ---
top 3 results:
1. 'A guide to training your new puppy.' (similarity: 0.8115)
2. 'How to keep your house cat happy and healthy.' (similarity: 0.7468)
3. 'The best recipes for homemade pasta.' (similarity: 0.6954)
--- searching for: 'planning a trip to South America' ---
top 3 results:
1. 'Exploring the Andes: a journey through South America.' (similarity: 0.8775)
2. 'Tips for hiking in the mountains.' (similarity: 0.7697)
3. 'The best recipes for homemade pasta.' (similarity: 0.6841)
--- searching for: 'advanced computer science topics' ---
top 3 results:
1. 'The fundamentals of machine learning.' (similarity: 0.8121)
2. 'Recent breakthroughs in quantum computing.' (similarity: 0.7713)
3. 'Exploring the Andes: a journey through South America.' (similarity: 0.7360)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment