Last active
September 5, 2025 14:26
-
-
Save kkirsche/924f7b3c294461fcb08f93c2dac32c44 to your computer and use it in GitHub Desktop.
Semantic Search Example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.13" | |
| # dependencies = [ | |
| # "sentence-transformers", | |
| # "sqlite-vec", | |
| # "numpy", | |
| # ] | |
| # /// | |
| import sqlite3 | |
| import sentence_transformers | |
| import sqlite_vec | |
# Model identifier handed to sentence_transformers.SentenceTransformer in main().
MODEL_NAME = "baai/bge-large-en-v1.5"

# Small demo corpus: ingest_data() embeds these strings and stores them with
# ids 1..len(DOCUMENTS), in list order.
DOCUMENTS = [
    "A guide to training your new puppy.",
    "The best recipes for homemade pasta.",
    "Exploring the Andes: a journey through South America.",
    "Recent breakthroughs in quantum computing.",
    "How to keep your house cat happy and healthy.",
    "The fundamentals of machine learning.",
    "A brief history of the Roman Empire.",
    "Tips for hiking in the mountains.",
]
def setup_database(conn: sqlite3.Connection, vector_dim: int) -> None:
    """Load sqlite-vec into ``conn`` and create the demo schema.

    Creates three objects on the connection:

    * ``documents`` -- id, text and the raw embedding blob.
    * ``documents_vec`` -- a ``vec0`` virtual table with one
      ``FLOAT[vector_dim]`` embedding column.
    * ``documents_insert`` -- an ``AFTER INSERT`` trigger that copies each new
      ``documents`` row's id and embedding into ``documents_vec``.

    Args:
        conn: Open SQLite connection to prepare.
        vector_dim: Number of float components per embedding.

    Raises:
        TypeError: If ``vector_dim`` is not an integer (``bool`` is rejected).
        ValueError: If ``vector_dim`` is less than 1.
    """
    # Validate first: vector_dim is interpolated into DDL below, and failing
    # here keeps the connection untouched instead of leaving a half-built
    # schema behind.
    if isinstance(vector_dim, bool) or not isinstance(vector_dim, int):
        raise TypeError("Vector dimensions were not an integer")
    if vector_dim < 1:
        raise ValueError("Vector dimensions must be a positive integer")

    print("Setting up the database schema")

    conn.enable_load_extension(True)
    try:
        conn.load_extension(sqlite_vec.loadable_path())
    finally:
        # Always switch extension loading back off, even if the load failed.
        conn.enable_load_extension(False)
    print("-> sqlite-vec extension loaded.")

    conn.execute("""
        CREATE TABLE documents (
            id INTEGER PRIMARY KEY,
            text TEXT NOT NULL,
            embedding BLOB NOT NULL
        )
    """)
    print("-> 'documents' table created.")

    conn.execute(f"""
        CREATE VIRTUAL TABLE documents_vec USING vec0 (
            embedding FLOAT[{vector_dim}]
        )
    """)
    print("-> 'documents_vec' virtual table created.")

    # Keep the vector index in step with the source table on every insert.
    conn.execute("""
        CREATE TRIGGER documents_insert AFTER INSERT ON documents
        BEGIN
            INSERT INTO documents_vec (rowid, embedding) VALUES (new.id, new.embedding);
        END;
    """)
    print("-> sync triggers created")
def ingest_data(
    conn: sqlite3.Connection, model: sentence_transformers.SentenceTransformer
) -> None:
    """Embed every entry of ``DOCUMENTS`` and insert it into ``documents``.

    Rows get ids ``1..len(DOCUMENTS)`` in list order, and each embedding is
    stored as the raw bytes of a float32 vector. All inserts run in a single
    transaction: it is committed when every row succeeds and rolled back if
    any insert raises.

    Args:
        conn: Connection on which the ``documents`` table already exists.
        model: Sentence-transformers model used to encode the documents.
    """
    print("\nembedding and ingesting documents...")
    embeddings = model.encode(
        DOCUMENTS,
        convert_to_numpy=True,
        normalize_embeddings=True,
        # The prompt is prepended verbatim to each text, so it carries its own
        # ": " separator (otherwise it fuses with the document's first word).
        prompt="Represent this text for semantic search retrieval: ",
    ).astype("float32")  # cast once for the whole batch rather than per row

    # Connection context manager: commit on success, roll back on error.
    with conn:
        for doc_id, (doc, emb) in enumerate(
            zip(DOCUMENTS, embeddings, strict=True), start=1
        ):
            conn.execute(
                "INSERT INTO documents (id, text, embedding) VALUES (?, ?, ?)",
                (doc_id, doc, emb.tobytes()),
            )
            print(f" -> ingested: '{doc[:30]}...'")
def semantic_search(
    conn: sqlite3.Connection,
    model: sentence_transformers.SentenceTransformer,
    query: str,
    top_k: int = 3,
) -> None:
    """Print the stored documents most similar to ``query``.

    The query is encoded with ``model``, compared against every row of
    ``documents_vec`` via ``vec_distance_cosine``, and at most ``top_k`` rows
    are printed in descending order of ``1 - cosine distance``.

    Args:
        conn: Connection holding the ``documents`` / ``documents_vec`` tables.
        model: Sentence-transformers model used to encode the query.
        query: Free-text search string.
        top_k: Value bound to the SQL ``LIMIT``.
    """
    print(f"\n--- searching for: '{query}' ---")

    query_vector = model.encode(
        query,
        convert_to_numpy=True,
        normalize_embeddings=True,
        prompt="Represent this text for searching relevant semantic search results: ",
    )
    query_blob = query_vector.astype("float32").tobytes()

    sql = """
    SELECT
    d.text,
    1 - vec_distance_cosine(dv.embedding, ?) AS similarity
    FROM documents_vec dv
    JOIN documents d on d.id = dv.rowid
    ORDER BY similarity DESC
    LIMIT ?
    """
    matches = conn.execute(sql, (query_blob, top_k)).fetchall()

    if len(matches) == 0:
        print("no results found.")
        return

    print(f"top {len(matches)} results:")
    for rank, (text, similarity) in enumerate(matches, start=1):
        print(f" {rank}. '{text}' (similarity: {similarity:.4f})")
def main() -> None:
    """Run the end-to-end demo against an in-memory database.

    Loads the model named by ``MODEL_NAME``, builds the schema sized to the
    model's embedding dimension, ingests the documents, then runs three sample
    searches. The connection is closed even if a step fails.

    Raises:
        ValueError: If the model reports no embedding dimension.
    """
    encoder = sentence_transformers.SentenceTransformer(MODEL_NAME)

    dim = encoder.get_sentence_embedding_dimension()
    if dim is None:
        raise ValueError("model is expected to have vector dimension")

    sample_queries = (
        "information about domestic pets",
        "planning a trip to South America",
        "advanced computer science topics",
    )

    db = sqlite3.connect(":memory:")
    try:
        setup_database(db, dim)
        ingest_data(db, encoder)
        for text in sample_queries:
            semantic_search(db, encoder, text)
    finally:
        db.close()
if __name__ == "__main__":
    # Force the "spawn" start method before anything else runs, for
    # multiprocessing compatibility with sentence-transformers (this matters
    # most on macOS and Windows).
    import multiprocessing

    multiprocessing.set_start_method(method="spawn", force=True)

    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ❯ uv run --script main.py | |
| Setting up the database schema | |
| -> sqlite-vec extension loaded. | |
| -> 'documents' table created. | |
| -> 'documents_vec' virtual table created. | |
| -> sync triggers created | |
| embedding and ingesting documents... | |
| -> ingested: 'A guide to training your new p...' | |
| -> ingested: 'The best recipes for homemade ...' | |
| -> ingested: 'Exploring the Andes: a journey...' | |
| -> ingested: 'Recent breakthroughs in quantu...' | |
| -> ingested: 'How to keep your house cat hap...' | |
| -> ingested: 'The fundamentals of machine le...' | |
| -> ingested: 'A brief history of the Roman E...' | |
| -> ingested: 'Tips for hiking in the mountai...' | |
| --- searching for: 'information about domestic pets' --- | |
| top 3 results: | |
| 1. 'A guide to training your new puppy.' (similarity: 0.8115) | |
| 2. 'How to keep your house cat happy and healthy.' (similarity: 0.7468) | |
| 3. 'The best recipes for homemade pasta.' (similarity: 0.6954) | |
| --- searching for: 'planning a trip to South America' --- | |
| top 3 results: | |
| 1. 'Exploring the Andes: a journey through South America.' (similarity: 0.8775) | |
| 2. 'Tips for hiking in the mountains.' (similarity: 0.7697) | |
| 3. 'The best recipes for homemade pasta.' (similarity: 0.6841) | |
| --- searching for: 'advanced computer science topics' --- | |
| top 3 results: | |
| 1. 'The fundamentals of machine learning.' (similarity: 0.8121) | |
| 2. 'Recent breakthroughs in quantum computing.' (similarity: 0.7713) | |
| 3. 'Exploring the Andes: a journey through South America.' (similarity: 0.7360) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment