# This example shows how to use Cohere binary embeddings to get a 32x reduction in memory
# and up to a 40x faster search speed.
# You need the Cohere Python SDK as well as faiss:
# pip install cohere faiss-cpu numpy

import faiss
import cohere
import numpy as np
documents = [
    "Alan Turing was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.",
    "Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.",
    "Isaac Newton was an English polymath active as a mathematician, physicist, astronomer, alchemist, theologian, and author who was described in his time as a natural philosopher.",
    "Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity"
]
# Step 1: Get your Cohere API key from: www.cohere.com
api_key = "<<YOUR_API_KEY>>"
co = cohere.Client(api_key)
# Step 2: Create a faiss IndexBinaryFlat index
num_dim = 1024  # Use 1024 dimensions for embed-english-v3.0 and 384 for the light models
index = faiss.IndexBinaryFlat(num_dim)
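# Added note: faiss binary indexes take their dimension in *bits*. Vectors are
# added and stored as packed uint8 arrays, so each 1024-bit embedding occupies
# 1024 / 8 = 128 bytes, and search returns Hamming distances. A minimal sanity
# check on the packed layout:
assert index.code_size == num_dim // 8  # 128 bytes per vector for num_dim=1024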
# Step 3: Compute the document embeddings in batches
batch_size = 384
for start_idx in range(0, len(documents), batch_size):
    batch_documents = documents[start_idx:start_idx+batch_size]

    # Compute the binary embeddings of your documents. Set input_type to "search_document"
    # and embedding_types to ["ubinary"] to get packed unsigned binary embeddings.
    embeddings = co.embed(texts=batch_documents, model="embed-english-v3.0", input_type="search_document", embedding_types=["ubinary"]).embeddings.ubinary

    # Cast embeddings to a uint8 numpy array
    embeddings = np.asarray(embeddings, dtype='uint8')

    # Add the embeddings to the faiss index
    index.add(embeddings)

print("Indexing of documents finished")
# Optional: Write index to disk
index_name = "my_index.bin"
faiss.write_index_binary(index, index_name)

# Optional: Load index from disk
index = faiss.read_index_binary(index_name)
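# Added sanity check: the reloaded index should contain one packed vector per document.
assert index.ntotal == len(documents)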
# The following function takes a query and calls the API to get both the binary and the float
# query embeddings. We use the binary query embedding for a quick Hamming-distance search in our
# faiss IndexBinaryFlat index, retrieving 10*top_k candidates. We then re-score these candidates
# with the float query embedding against the (unpacked) binary document embeddings, which gives
# another boost in search quality.
def search(index, query, top_k=3):
    # Make sure to set input_type="search_query"
    query_emb = co.embed(texts=[query], model="embed-english-v3.0", input_type="search_query", embedding_types=["ubinary", "float"]).embeddings
    query_emb_bin = np.asarray(query_emb.ubinary, dtype='uint8')
    query_emb_float = np.asarray(query_emb.float, dtype="float32")

    # Phase I: Search on the binary index with the binary query embedding
    hits_scores, hits_doc_ids = index.search(query_emb_bin, k=min(10*top_k, index.ntotal))

    # Collect the results in a list of hits
    hits = [{'doc_id': doc_id.item(), 'score_bin': score_bin} for doc_id, score_bin in zip(hits_doc_ids[0], hits_scores[0])]

    # Phase II: Re-score the candidates with the float query embedding.
    # Reconstruct the packed binary document embeddings, unpack them to single bits,
    # and map {0, 1} to {-1, 1} so that the dot product with the float query embedding
    # approximates the original similarity.
    binary_doc_emb = np.asarray([index.reconstruct(hit['doc_id']) for hit in hits])
    binary_doc_emb_unpacked = np.unpackbits(binary_doc_emb, axis=-1).astype("int")
    binary_doc_emb_unpacked = 2*binary_doc_emb_unpacked - 1
    scores_cont = query_emb_float[0] @ binary_doc_emb_unpacked.T
    for idx in range(len(scores_cont)):
        hits[idx]['score_cont'] = scores_cont[idx]

    # Sort by largest score_cont
    hits.sort(key=lambda x: x['score_cont'], reverse=True)
    return hits[0:top_k]
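# Added illustration (not part of the search flow): how the Phase II unpacking works.
# np.unpackbits expands each uint8 byte into 8 bits (most significant bit first):
example_bits = np.unpackbits(np.asarray([[0b10100000]], dtype='uint8'), axis=-1)
# example_bits -> [[1 0 1 0 0 0 0 0]]; mapped via 2*x - 1 this becomes
# [[ 1 -1  1 -1 -1 -1 -1 -1]], i.e. each bit acts as a signed +/-1 component.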
# Search in your index
query = "Who discovered x-ray?"
print("Query:", query)

hits = search(index, query)
for hit in hits:
    print(f"{hit['score_cont']:.2f}", documents[hit['doc_id']])