# This example shows how to use Cohere binary embeddings to get a 32x reduction in memory
# and up to 40x faster search.
# You need the Cohere Python SDK as well as faiss
# pip install cohere faiss-cpu numpy
import faiss
import cohere
import numpy as np
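# Where the 32x figure comes from (added sketch, not part of the original gist):
# a 1024-dim float32 embedding needs 1024 * 4 = 4096 bytes, while the same vector
# packed to 1 bit per dimension needs only 1024 / 8 = 128 bytes.
bytes_float32 = 1024 * 4  # 4096 bytes per float32 vector
bytes_binary = 1024 // 8  # 128 bytes per packed binary vector
assert bytes_float32 // bytes_binary == 32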
documents = [
    "Alan Turing was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.",
    "Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.",
    "Isaac Newton was an English polymath active as a mathematician, physicist, astronomer, alchemist, theologian, and author who was described in his time as a natural philosopher.",
    "Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity",
]
# Step 1: Get your Cohere API key from: www.cohere.com
api_key = "<<YOUR_API_KEY>>"
co = cohere.Client(api_key)
# Step 2: Create a faiss IndexBinaryFlat index
num_dim = 1024  # Use 1024 dimensions for embed-english-v3.0 and 384 for the light models
index = faiss.IndexBinaryFlat(num_dim)
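# Note (added): IndexBinaryFlat takes its dimension in *bits*, which must be a
# multiple of 8. Vectors are added as uint8 arrays of shape (n, num_dim // 8),
# and search ranks candidates by Hamming distance using fast popcount instructions.
assert num_dim % 8 == 0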
# Step 3: Compute the document embeddings in batches
# Note: the Cohere embed endpoint accepts a limited number of texts per call
# (96 at the time of writing), so keep batch_size at or below that limit
batch_size = 96
for start_idx in range(0, len(documents), batch_size):
    batch_documents = documents[start_idx:start_idx+batch_size]
    # Compute the binary embeddings of your documents. Set input_type to "search_document" and embedding_types to ["ubinary"]
    embeddings = co.embed(texts=batch_documents, model="embed-english-v3.0", input_type="search_document", embedding_types=["ubinary"]).embeddings.ubinary
    # Convert the embeddings to a uint8 numpy array; each row holds num_dim // 8 packed bytes
    embeddings = np.asarray(embeddings, dtype='uint8')
    # Add the packed embeddings to the faiss index
    index.add(embeddings)
print("Indexing of documents finished")
# Optional: Write index to disk
index_name = "my_index.bin"
faiss.write_index_binary(index, index_name)
# Optional: Load index from disk
index = faiss.read_index_binary(index_name)
# The following function takes a query and calls the API to get both the binary
# and the float query embeddings. We use the binary embedding for a fast search
# in our faiss IndexBinaryFlat index and retrieve 10*top_k candidates. These
# candidates are then re-scored with the float query embedding against the binary
# document embeddings, which gives a further boost in search quality.
def search(index, query, top_k=3):
    # Make sure to set input_type="search_query"
    query_emb = co.embed(texts=[query], model="embed-english-v3.0", input_type="search_query", embedding_types=["ubinary", "float"]).embeddings
    query_emb_bin = np.asarray(query_emb.ubinary, dtype='uint8')
    query_emb_float = np.asarray(query_emb.float, dtype="float32")

    # Phase I: Search the binary index with the binary query embedding (Hamming distance)
    hits_scores, hits_doc_ids = index.search(query_emb_bin, k=min(10*top_k, index.ntotal))

    # Collect the results in a list of hits
    hits = [{'doc_id': doc_id.item(), 'score_bin': score_bin} for doc_id, score_bin in zip(hits_doc_ids[0], hits_scores[0])]

    # Phase II: Re-score with the float query embedding against the binary document
    # embeddings. Unpack the stored bits and map {0, 1} to {-1, +1} so the dot product
    # with the float query approximates the original float similarity.
    binary_doc_emb = np.asarray([index.reconstruct(hit['doc_id']) for hit in hits])
    binary_doc_emb_unpacked = np.unpackbits(binary_doc_emb, axis=-1).astype("int")
    binary_doc_emb_unpacked = 2*binary_doc_emb_unpacked - 1
    scores_cont = query_emb_float[0] @ binary_doc_emb_unpacked.T
    for idx in range(len(scores_cont)):
        hits[idx]['score_cont'] = scores_cont[idx]

    # Sort by largest score_cont
    hits.sort(key=lambda x: x['score_cont'], reverse=True)
    return hits[:top_k]
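# A minimal illustration (added, not part of the original gist) of the unpacking
# trick used in Phase II: np.unpackbits expands each packed byte back into its
# 8 bits, and 2*b - 1 maps {0, 1} to {-1, +1}.
example_bits = np.unpackbits(np.asarray([[0b10110001]], dtype="uint8"), axis=-1).astype("int")
print(2 * example_bits - 1)  # [[ 1 -1  1  1 -1 -1 -1  1]]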
# Search in your index
query = "Who discovered x-ray?"
print("Query:", query)
hits = search(index, query)
for hit in hits:
    print(f"{hit['score_cont']:.2f}", documents[hit['doc_id']])