kkirsche · September 5, 2025 14:26
diff --git a/main.py b/main.py
 # /// script
 # requires-python = ">=3.13"
 # dependencies = [
 #     "sentence-transformers",
 #     "sqlite-vec",
 #     "numpy",
 # ]
 # ///


 import sqlite3

 import sentence_transformers
 import sqlite_vec

 # Name of the model handed to sentence_transformers.SentenceTransformer in main().
 MODEL_NAME = "baai/bge-large-en-v1.5"

 # Demo corpus. ingest_data() embeds these strings and inserts them in list
 # order with ids 1..len(DOCUMENTS).
 DOCUMENTS = [
    "A guide to training your new puppy.",
    "The best recipes for homemade pasta.",
    "Exploring the Andes: a journey through South America.",
    "Recent breakthroughs in quantum computing.",
    "How to keep your house cat happy and healthy.",
    "The fundamentals of machine learning.",
    "A brief history of the Roman Empire.",
    "Tips for hiking in the mountains.",
 ]


 def setup_database(conn: sqlite3.Connection, vector_dim: int) -> None:
    """Load the sqlite-vec extension into *conn* and create the demo schema.

    Three objects are created:

    * ``documents`` - id, text and the raw embedding blob.
    * ``documents_vec`` - a ``vec0`` virtual table with one
      ``FLOAT[vector_dim]`` embedding column.
    * ``documents_insert`` - an AFTER INSERT trigger that copies the id and
      embedding of every new ``documents`` row into ``documents_vec``.
      Only inserts are mirrored; no UPDATE or DELETE trigger is created.

    Args:
        conn: Open SQLite connection. Extension loading is switched on only
            for the duration of the sqlite-vec load and switched off again.
        vector_dim: Width of the embedding vectors, interpolated into the
            ``vec0`` table definition.

    Raises:
        TypeError: If ``vector_dim`` is not an ``int`` (``bool`` is rejected
            as well). Raised before the connection is touched, so a bad
            value never leaves a half-built schema behind.
    """
    # vector_dim is formatted straight into the DDL below, so it must be
    # validated first -- before any extension is loaded or table is created.
    if isinstance(vector_dim, bool) or not isinstance(vector_dim, int):
        raise TypeError("Vector dimensions were not an integer")

    print("Setting up the database schema")
    conn.enable_load_extension(True)
    conn.load_extension(sqlite_vec.loadable_path())
    # Turn extension loading back off as soon as sqlite-vec is in.
    conn.enable_load_extension(False)
    print("-> sqlite-vec extension loaded.")

    conn.execute("""
    CREATE TABLE documents (
        id INTEGER PRIMARY KEY,
        text TEXT NOT NULL,
        embedding BLOB NOT NULL
    )
    """)
    print("-> 'documents' table created.")

    conn.execute(f"""
    CREATE VIRTUAL TABLE documents_vec USING vec0 (
        embedding FLOAT[{vector_dim}]
    )
    """)
    print("-> 'documents_vec' virtual table created.")

    # Keep the vector index in step with the base table on insert.
    conn.execute("""
    CREATE TRIGGER documents_insert AFTER INSERT ON documents
    BEGIN
        INSERT INTO documents_vec (rowid, embedding) VALUES (new.id, new.embedding);
    END;
    """)
    print("-> sync triggers created")


 def ingest_data(
    conn: sqlite3.Connection, model: sentence_transformers.SentenceTransformer
 ) -> None:
    """Embed every entry of ``DOCUMENTS`` and store it in ``documents``.

    All documents are encoded in a single ``model.encode`` call with
    normalized embeddings. Each vector is then written as float32 bytes
    together with its text, using ids 1..N in list order. One progress line
    is printed per document and the connection is committed once at the end.

    Args:
        conn: Connection on which the ``documents`` table already exists.
        model: Model used to encode the documents.
    """
    print("\nembedding and ingesting documents...")
    # NOTE(review): unlike the query prompt in semantic_search, this prompt
    # has no trailing ": " separator -- confirm that is intended.
    vectors = model.encode(
        DOCUMENTS,
        prompt="Represent this text for semantic search retrieval",
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    insert_sql = "INSERT INTO documents (id, text, embedding) VALUES (?, ?, ?)"
    # strict=True raises if the model returns a different number of vectors
    # than there are documents.
    paired = zip(DOCUMENTS, vectors, strict=True)
    for doc_id, (text, vector) in enumerate(paired, start=1):
        blob = vector.astype("float32").tobytes()
        conn.execute(insert_sql, (doc_id, text, blob))
        print(f"  -> ingested: '{text[:30]}...'")
    conn.commit()


 def semantic_search(
    conn: sqlite3.Connection,
    model: sentence_transformers.SentenceTransformer,
    query: str,
    top_k: int = 3,
 ) -> None:
    """Print the stored documents most similar to ``query``.

    The query is encoded with normalized embeddings, passed to SQLite as
    float32 bytes, and compared against every row of ``documents_vec``.
    Similarity is ``1 - vec_distance_cosine(...)``; rows are printed in
    descending similarity order.

    Args:
        conn: Connection holding the ``documents`` / ``documents_vec`` tables.
        model: Model used to encode the query.
        query: Free-text search string.
        top_k: Maximum number of rows to print.
    """
    print(f"\n--- searching for: '{query}' ---")
    query_vector = model.encode(
        query,
        prompt="Represent this text for searching relevant semantic search results: ",
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    query_blob = query_vector.astype("float32").tobytes()
    ranking_sql = """
    SELECT
        d.text,
        1 - vec_distance_cosine(dv.embedding, ?) AS similarity
    FROM documents_vec dv
    JOIN documents d on d.id = dv.rowid
    ORDER BY similarity DESC
    LIMIT ?
    """
    matches = conn.execute(ranking_sql, (query_blob, top_k)).fetchall()
    if not matches:
        print("no results found.")
        return

    print(f"top {len(matches)} results:")
    for rank, (text, similarity) in enumerate(matches, start=1):
        print(f"  {rank}. '{text}' (similarity: {similarity:.4f})")


 def main() -> None:
    """Run the demo: load the model, build the schema, ingest, then search.

    Uses an in-memory SQLite database that is closed even if a step fails.

    Raises:
        ValueError: If the model does not report an embedding dimension.
    """
    encoder = sentence_transformers.SentenceTransformer(MODEL_NAME)
    dimension = encoder.get_sentence_embedding_dimension()
    if dimension is None:
        raise ValueError("model is expected to have vector dimension")

    demo_queries = (
        "information about domestic pets",
        "planning a trip to South America",
        "advanced computer science topics",
    )
    db = sqlite3.connect(":memory:")
    try:
        setup_database(db, dimension)
        ingest_data(db, encoder)
        for question in demo_queries:
            semantic_search(db, encoder, question)
    finally:
        db.close()


 if __name__ == "__main__":
    # Set the start method for multiprocessing compatibility with sentence-transformers
    # This is particularly important on macOS and Windows.
    # NOTE(review): per the Python multiprocessing docs "spawn" is already the
    # default on macOS and Windows, so this presumably matters most on Linux --
    # confirm the original motivation.
    import multiprocessing

    # force=True replaces a start method that has already been set.
    multiprocessing.set_start_method("spawn", force=True)

    main()
diff --git a/output.txt b/output.txt
 ❯ uv run --script main.py
 Setting up the database schema
 -> sqlite-vec extension loaded.
 -> 'documents' table created.
 -> 'documents_vec' virtual table created.
 -> sync triggers created

 embedding and ingesting documents...
  -> ingested: 'A guide to training your new p...'
  -> ingested: 'The best recipes for homemade ...'
  -> ingested: 'Exploring the Andes: a journey...'
  -> ingested: 'Recent breakthroughs in quantu...'
  -> ingested: 'How to keep your house cat hap...'
  -> ingested: 'The fundamentals of machine le...'
  -> ingested: 'A brief history of the Roman E...'
  -> ingested: 'Tips for hiking in the mountai...'

 --- searching for: 'information about domestic pets' ---
 top 3 results:
  1. 'A guide to training your new puppy.' (similarity: 0.8115)
  2. 'How to keep your house cat happy and healthy.' (similarity: 0.7468)
  3. 'The best recipes for homemade pasta.' (similarity: 0.6954)

 --- searching for: 'planning a trip to South America' ---
 top 3 results:
  1. 'Exploring the Andes: a journey through South America.' (similarity: 0.8775)
  2. 'Tips for hiking in the mountains.' (similarity: 0.7697)
  3. 'The best recipes for homemade pasta.' (similarity: 0.6841)

 --- searching for: 'advanced computer science topics' ---
 top 3 results:
  1. 'The fundamentals of machine learning.' (similarity: 0.8121)
  2. 'Recent breakthroughs in quantum computing.' (similarity: 0.7713)
  3. 'Exploring the Andes: a journey through South America.' (similarity: 0.7360)
	# /// script
	# requires-python = ">=3.13"
	# dependencies = [
	# "sentence-transformers",
	# "sqlite-vec",
	# "numpy",
	# ]
	# ///


	import sqlite3

	import sentence_transformers
	import sqlite_vec

	# Name of the model handed to sentence_transformers.SentenceTransformer in main().
	MODEL_NAME = "baai/bge-large-en-v1.5"

	# Demo corpus. ingest_data() embeds these strings and inserts them in list
	# order with ids 1..len(DOCUMENTS).
	DOCUMENTS = [
	"A guide to training your new puppy.",
	"The best recipes for homemade pasta.",
	"Exploring the Andes: a journey through South America.",
	"Recent breakthroughs in quantum computing.",
	"How to keep your house cat happy and healthy.",
	"The fundamentals of machine learning.",
	"A brief history of the Roman Empire.",
	"Tips for hiking in the mountains.",
	]


	def setup_database(conn: sqlite3.Connection, vector_dim: int) -> None:
	    """Load the sqlite-vec extension into *conn* and create the demo schema.

	    Three objects are created:

	    * ``documents`` - id, text and the raw embedding blob.
	    * ``documents_vec`` - a ``vec0`` virtual table with one
	      ``FLOAT[vector_dim]`` embedding column.
	    * ``documents_insert`` - an AFTER INSERT trigger that copies the id and
	      embedding of every new ``documents`` row into ``documents_vec``.
	      Only inserts are mirrored; no UPDATE or DELETE trigger is created.

	    Args:
	        conn: Open SQLite connection. Extension loading is switched on only
	            for the duration of the sqlite-vec load and switched off again.
	        vector_dim: Width of the embedding vectors, interpolated into the
	            ``vec0`` table definition.

	    Raises:
	        TypeError: If ``vector_dim`` is not an ``int`` (``bool`` is rejected
	            as well). Raised before the connection is touched, so a bad
	            value never leaves a half-built schema behind.
	    """
	    # vector_dim is formatted straight into the DDL below, so it must be
	    # validated first -- before any extension is loaded or table is created.
	    if isinstance(vector_dim, bool) or not isinstance(vector_dim, int):
	        raise TypeError("Vector dimensions were not an integer")

	    print("Setting up the database schema")
	    conn.enable_load_extension(True)
	    conn.load_extension(sqlite_vec.loadable_path())
	    # Turn extension loading back off as soon as sqlite-vec is in.
	    conn.enable_load_extension(False)
	    print("-> sqlite-vec extension loaded.")

	    conn.execute("""
	CREATE TABLE documents (
	id INTEGER PRIMARY KEY,
	text TEXT NOT NULL,
	embedding BLOB NOT NULL
	)
	""")
	    print("-> 'documents' table created.")

	    conn.execute(f"""
	CREATE VIRTUAL TABLE documents_vec USING vec0 (
	embedding FLOAT[{vector_dim}]
	)
	""")
	    print("-> 'documents_vec' virtual table created.")

	    # Keep the vector index in step with the base table on insert.
	    conn.execute("""
	CREATE TRIGGER documents_insert AFTER INSERT ON documents
	BEGIN
	INSERT INTO documents_vec (rowid, embedding) VALUES (new.id, new.embedding);
	END;
	""")
	    print("-> sync triggers created")


	def ingest_data(
	    conn: sqlite3.Connection, model: sentence_transformers.SentenceTransformer
	) -> None:
	    """Embed every entry of ``DOCUMENTS`` and store it in ``documents``.

	    All documents are encoded in a single ``model.encode`` call with
	    normalized embeddings. Each vector is then written as float32 bytes
	    together with its text, using ids 1..N in list order. One progress line
	    is printed per document and the connection is committed once at the end.

	    Args:
	        conn: Connection on which the ``documents`` table already exists.
	        model: Model used to encode the documents.
	    """
	    print("\nembedding and ingesting documents...")
	    # NOTE(review): unlike the query prompt in semantic_search, this prompt
	    # has no trailing ": " separator -- confirm that is intended.
	    vectors = model.encode(
	        DOCUMENTS,
	        prompt="Represent this text for semantic search retrieval",
	        normalize_embeddings=True,
	        convert_to_numpy=True,
	    )
	    insert_sql = "INSERT INTO documents (id, text, embedding) VALUES (?, ?, ?)"
	    # strict=True raises if the model returns a different number of vectors
	    # than there are documents.
	    paired = zip(DOCUMENTS, vectors, strict=True)
	    for doc_id, (text, vector) in enumerate(paired, start=1):
	        blob = vector.astype("float32").tobytes()
	        conn.execute(insert_sql, (doc_id, text, blob))
	        print(f" -> ingested: '{text[:30]}...'")
	    conn.commit()


	def semantic_search(
	    conn: sqlite3.Connection,
	    model: sentence_transformers.SentenceTransformer,
	    query: str,
	    top_k: int = 3,
	) -> None:
	    """Print the stored documents most similar to ``query``.

	    The query is encoded with normalized embeddings, passed to SQLite as
	    float32 bytes, and compared against every row of ``documents_vec``.
	    Similarity is ``1 - vec_distance_cosine(...)``; rows are printed in
	    descending similarity order.

	    Args:
	        conn: Connection holding the ``documents`` / ``documents_vec`` tables.
	        model: Model used to encode the query.
	        query: Free-text search string.
	        top_k: Maximum number of rows to print.
	    """
	    print(f"\n--- searching for: '{query}' ---")
	    query_vector = model.encode(
	        query,
	        prompt="Represent this text for searching relevant semantic search results: ",
	        normalize_embeddings=True,
	        convert_to_numpy=True,
	    )
	    query_blob = query_vector.astype("float32").tobytes()
	    ranking_sql = """
	SELECT
	d.text,
	1 - vec_distance_cosine(dv.embedding, ?) AS similarity
	FROM documents_vec dv
	JOIN documents d on d.id = dv.rowid
	ORDER BY similarity DESC
	LIMIT ?
	"""
	    matches = conn.execute(ranking_sql, (query_blob, top_k)).fetchall()
	    if not matches:
	        print("no results found.")
	        return

	    print(f"top {len(matches)} results:")
	    for rank, (text, similarity) in enumerate(matches, start=1):
	        print(f" {rank}. '{text}' (similarity: {similarity:.4f})")


	def main() -> None:
	    """Run the demo: load the model, build the schema, ingest, then search.

	    Uses an in-memory SQLite database that is closed even if a step fails.

	    Raises:
	        ValueError: If the model does not report an embedding dimension.
	    """
	    encoder = sentence_transformers.SentenceTransformer(MODEL_NAME)
	    dimension = encoder.get_sentence_embedding_dimension()
	    if dimension is None:
	        raise ValueError("model is expected to have vector dimension")

	    demo_queries = (
	        "information about domestic pets",
	        "planning a trip to South America",
	        "advanced computer science topics",
	    )
	    db = sqlite3.connect(":memory:")
	    try:
	        setup_database(db, dimension)
	        ingest_data(db, encoder)
	        for question in demo_queries:
	            semantic_search(db, encoder, question)
	    finally:
	        db.close()


	if __name__ == "__main__":
	# Set the start method for multiprocessing compatibility with sentence-transformers
	# This is particularly important on macOS and Windows.
	# NOTE(review): per the Python multiprocessing docs "spawn" is already the
	# default on macOS and Windows, so this presumably matters most on Linux --
	# confirm the original motivation.
	import multiprocessing

	# force=True replaces a start method that has already been set.
	multiprocessing.set_start_method("spawn", force=True)

	main()
	❯ uv run --script main.py
	Setting up the database schema
	-> sqlite-vec extension loaded.
	-> 'documents' table created.
	-> 'documents_vec' virtual table created.
	-> sync triggers created

	embedding and ingesting documents...
	-> ingested: 'A guide to training your new p...'
	-> ingested: 'The best recipes for homemade ...'
	-> ingested: 'Exploring the Andes: a journey...'
	-> ingested: 'Recent breakthroughs in quantu...'
	-> ingested: 'How to keep your house cat hap...'
	-> ingested: 'The fundamentals of machine le...'
	-> ingested: 'A brief history of the Roman E...'
	-> ingested: 'Tips for hiking in the mountai...'

	--- searching for: 'information about domestic pets' ---
	top 3 results:
	1. 'A guide to training your new puppy.' (similarity: 0.8115)
	2. 'How to keep your house cat happy and healthy.' (similarity: 0.7468)
	3. 'The best recipes for homemade pasta.' (similarity: 0.6954)

	--- searching for: 'planning a trip to South America' ---
	top 3 results:
	1. 'Exploring the Andes: a journey through South America.' (similarity: 0.8775)
	2. 'Tips for hiking in the mountains.' (similarity: 0.7697)
	3. 'The best recipes for homemade pasta.' (similarity: 0.6841)

	--- searching for: 'advanced computer science topics' ---
	top 3 results:
	1. 'The fundamentals of machine learning.' (similarity: 0.8121)
	2. 'Recent breakthroughs in quantum computing.' (similarity: 0.7713)
	3. 'Exploring the Andes: a journey through South America.' (similarity: 0.7360)