Last active
March 27, 2024 12:29
-
-
Save nreimers/3935ab5aeea076bc78c6b8d969ca3225 to your computer and use it in GitHub Desktop.
OpenSearch int8 search with Cohere
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # This code shows how to index data using Cohere Embed v3 byte (int8) embeddings. | |
| # This gives you a 4x memory reduction while keeping 99.9% of the search quality. | |
| # Make sure OpenSearch version 2.9 or later is running, e.g. by using Docker: | |
| # docker run -d -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:2.11.1 | |
| # You also need the OpenSearch Python client installed. | |
| # pip install cohere opensearch-py | |
| from opensearchpy import OpenSearch, helpers | |
| import cohere | |
| import time | |
| #Disable SSL warnings from OpenSearch | |
| from requests.packages import urllib3 | |
# Silence the InsecureRequestWarning that would otherwise be emitted because
# certificate verification is switched off for the OpenSearch connection below
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

# Name of the index this script creates
index_name = "test_index"

# Documents to index. Feel free to swap in a larger collection of your own
documents = [
    "Alan Turing was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.",
    "Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.",
    "Isaac Newton was an English polymath active as a mathematician, physicist, astronomer, alchemist, theologian, and author who was described in his time as a natural philosopher.",
    "Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity",
]

# Step 1: Get your Cohere API key from: www.cohere.com
api_key = "<<YOUR_API_KEY>>"
co = cohere.Client(api_key)

# Step 2: Connect to your OpenSearch instance using the default user (admin / admin).
# Adjust host, port and credentials if OpenSearch is hosted somewhere else
connection_settings = {
    "hosts": [{"host": "localhost", "port": 9200}],
    "http_auth": ("admin", "admin"),
    "use_ssl": True,
    "verify_certs": False,
}
os_client = OpenSearch(**connection_settings)
# Step 3: Describe the OpenSearch index we need.
# The 'text_emb' property holds the embeddings with "data_type": "byte";
# for byte vectors the engine must be Lucene.

# HNSW graph configuration for the vector field
hnsw_method = {
    "name": "hnsw",
    "space_type": "cosinesimil",
    "engine": "lucene",  # Lucene is required as the engine
    "parameters": {
        "ef_construction": 256,
        "m": 48,
    },
}

# Mapping of the embedding field
text_emb_mapping = {
    "type": "knn_vector",
    "dimension": 1024,  # Use 1024 for the large models, 384 for the light models
    "data_type": "byte",  # Store the vectors as bytes (int8)
    "method": hnsw_method,
}

# Full index definition: k-NN enabled settings plus the text / embedding mappings
index_body = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "knn": True,
            "knn.algo_param.ef_search": 100,
        },
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "text_emb": text_emb_mapping,
        },
    },
}
# Delete the index if it exists already, so the script can be re-run from a clean state.
# ignore=[400, 404] tells the client not to raise for these HTTP status codes
# (e.g. when the index is not there yet on the first run).
os_client.indices.delete(index=index_name, ignore=[400, 404])

# Create the new index. The index name is passed by keyword, like in every other
# client call of this script, so the call does not depend on positional-argument
# support of the installed opensearch-py version.
response = os_client.indices.create(index=index_name, body=index_body)
print("OpenSearch index created:", response)
# Step 4: Upsert text and embeddings to OpenSearch in batches.
# Each batch of up to 512 documents is embedded with Cohere and then bulk-indexed
batch_size = 512
for start_idx in range(0, len(documents), batch_size):
    batch_documents = documents[start_idx:start_idx + batch_size]

    # Compute the int8 embeddings of your documents.
    # Set input_type to "search_document" and embedding_types to "int8"
    embeddings = co.embed(
        texts=batch_documents,
        model="embed-english-v3.0",
        input_type="search_document",
        embedding_types=["int8"],
    ).embeddings.int8

    # Build the bulk actions. The document id is the position of the document in
    # `documents`, so ids stay unique and consecutive across batches.
    batch = [
        {
            "_index": index_name,
            "_id": doc_id,
            "_source": {
                "text": document,
                "text_emb": doc_emb,
            },
        }
        for doc_id, (document, doc_emb) in enumerate(
            zip(batch_documents, embeddings), start=start_idx
        )
    ]

    # Do a bulk upsert to OpenSearch
    helpers.bulk(os_client, batch)

print("Indexing of documents finished")

# Make the freshly indexed documents visible to search. An explicit refresh is
# deterministic, whereas sleeping for a fixed time only hopes that the periodic
# background refresh has already happened.
os_client.indices.refresh(index=index_name)
# Step 5: Search in your index. Start with the query text
query = "Who discovered x-ray?"

# Queries must be embedded with input_type="search_query"
# (the documents were embedded with "search_document")
embed_response = co.embed(
    texts=[query],
    model="embed-english-v3.0",
    input_type="search_query",
    embedding_types=["int8"],
)
query_emb = embed_response.embeddings.int8[0]

# Assemble the k-NN query on the 'text_emb' field and send it to OpenSearch
top_k = 3
knn_clause = {
    "text_emb": {
        "vector": query_emb,
        "k": top_k,
    },
}
query_body = {
    "size": top_k,
    "query": {"knn": knn_clause},
}
search_response = os_client.search(index=index_name, body=query_body)
hits = search_response["hits"]["hits"]

# Print the query followed by one "<score> <text>" line per hit
print("Query:", query)
for hit in hits:
    score, text = hit["_score"], hit["_source"]["text"]
    print(score, text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment