Skip to content

Instantly share code, notes, and snippets.

@nreimers
Last active March 27, 2024 12:29
Show Gist options
  • Select an option

  • Save nreimers/3935ab5aeea076bc78c6b8d969ca3225 to your computer and use it in GitHub Desktop.

Select an option

Save nreimers/3935ab5aeea076bc78c6b8d969ca3225 to your computer and use it in GitHub Desktop.
OpenSearch int8 search with Cohere
# This code shows how to index data using Cohere Embed v3 byte (int8) embeddings.
# This gives you a 4x memory reduction while keeping 99.9% of the search quality.
# Make sure to have OpenSearch running with at least version 2.9. E.g. by using docker:
# docker run -d -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:2.11.1
# You also need the OpenSearch python client installed.
# pip install cohere opensearch-py
from opensearchpy import OpenSearch, helpers
import cohere
import time
#Disable SSL warnings from OpenSearch
from requests.packages import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Name of the OpenSearch index this demo creates, fills, and queries.
index_name = "test_index"

# Tiny demo corpus -- swap in your own documents for a real workload.
documents = [
    "Alan Turing was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.",
    "Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.",
    "Isaac Newton was an English polymath active as a mathematician, physicist, astronomer, alchemist, theologian, and author who was described in his time as a natural philosopher.",
    "Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity",
]

# Step 1: Get your Cohere API key from: www.cohere.com
api_key = "<<YOUR_API_KEY>>"
co = cohere.Client(api_key)
# Step 2: Connect to your OpenSearch instance. Here we use the default user: admin and password: admin
# You might need to replace the host/port/username/password if OpenSearch is hosted somewhere else
os_client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),
    use_ssl=True,
    verify_certs=False,  # local docker image ships a self-signed certificate
)
# Step 3: Create the OpenSearch index.
# The 'text_emb' field is declared with "data_type": "byte" so each vector
# component is stored as a signed byte; byte vectors require the Lucene engine.
index_body = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "knn": True,
            "knn.algo_param.ef_search": 100,
        }
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "text_emb": {
                "type": "knn_vector",
                "dimension": 1024,    # 1024 for the large Embed v3 models, 384 for the light ones
                "data_type": "byte",  # store int8 components instead of float32
                "method": {
                    "name": "hnsw",
                    "space_type": "cosinesimil",
                    "engine": "lucene",  # byte vectors are only supported by Lucene
                    "parameters": {
                        "ef_construction": 256,
                        "m": 48,
                    },
                },
            },
        }
    },
}

# Drop any previous copy of the index, then create it fresh.
os_client.indices.delete(index=index_name, ignore=[400, 404])
response = os_client.indices.create(index_name, body=index_body)
print("OpenSearch index created:", response)
# Step 4: Upsert text and embeddings to OpenSearch in batches.
# NOTE(fix): the pasted original had lost the loop-body indentation (an
# IndentationError as captured); restored here as runnable code.
batch_size = 512
doc_id = 0
for start_idx in range(0, len(documents), batch_size):
    batch_documents = documents[start_idx:start_idx + batch_size]

    # Compute int8 document embeddings. Documents must be embedded with
    # input_type="search_document"; embedding_types=["int8"] selects the
    # quantized vectors (4x smaller than float32).
    embeddings = co.embed(
        texts=batch_documents,
        model="embed-english-v3.0",
        input_type="search_document",
        embedding_types=["int8"],
    ).embeddings.int8

    # Build one bulk request per batch and send it in a single round trip.
    batch = []
    for document, doc_emb in zip(batch_documents, embeddings):
        batch.append({
            "_index": index_name,
            "_id": doc_id,
            "_source": {
                "text": document,
                "text_emb": doc_emb,
            },
        })
        doc_id += 1
    helpers.bulk(os_client, batch)

print("Indexing of documents finished")

# Give opensearch some time to index the data
time.sleep(1)
# Step 5: Search the index.
# NOTE(fix): the pasted original had lost the indentation of the final print
# loop (an IndentationError as captured); restored here as runnable code.
query = "Who discovered x-ray?"

# Queries must be embedded with input_type="search_query" (asymmetric to the
# "search_document" type used at indexing time), again requesting int8 vectors.
query_emb = co.embed(
    texts=[query],
    model="embed-english-v3.0",
    input_type="search_query",
    embedding_types=["int8"],
).embeddings.int8[0]

# k-NN query against the byte vector field; return the top_k closest docs.
top_k = 3
query_body = {
    "size": top_k,
    "query": {
        "knn": {
            "text_emb": {
                "vector": query_emb,
                "k": top_k,
            }
        }
    },
}
hits = os_client.search(index=index_name, body=query_body)["hits"]["hits"]

# Print the results
print("Query:", query)
for hit in hits:
    print(hit['_score'], hit['_source']['text'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment