Skip to content

Instantly share code, notes, and snippets.

@nreimers
Last active March 27, 2024 12:29
Show Gist options
  • Select an option

  • Save nreimers/3935ab5aeea076bc78c6b8d969ca3225 to your computer and use it in GitHub Desktop.

Select an option

Save nreimers/3935ab5aeea076bc78c6b8d969ca3225 to your computer and use it in GitHub Desktop.
OpenSearch int8 search with Cohere
# This code shows how to index data using Cohere Embed v3 byte (int8) embeddings.
# This gives you a 4x memory reduction while keeping 99.9% of the search quality.
# Make sure to have OpenSearch running with at least version 2.9. E.g. by using docker:
# docker run -d -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:2.11.1
# You also need the OpenSearch python client installed.
# pip install cohere opensearch-py
from opensearchpy import OpenSearch, helpers
import cohere
import time
#Disable SSL warnings from OpenSearch
from requests.packages import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Name of the OpenSearch index this demo creates, fills, and queries.
index_name = "test_index"

# Tiny demo corpus -- swap in your own documents for a real workload.
documents = [
    "Alan Turing was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.",
    "Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.",
    "Isaac Newton was an English polymath active as a mathematician, physicist, astronomer, alchemist, theologian, and author who was described in his time as a natural philosopher.",
    "Marie Curie was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity",
]

# Step 1: Get your Cohere API key from: www.cohere.com
api_key = "<<YOUR_API_KEY>>"
co = cohere.Client(api_key)
# Step 2: Connect to your OpenSearch instance. Here we use the default user: admin and password: admin
# You might need to replace the host/port/username/password if OpenSearch is hosted somewhere else
os_client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),
    use_ssl=True,
    verify_certs=False,  # local docker image ships a self-signed certificate
)
# Step 3: Create the OpenSearch index.
# The 'text_emb' field is declared with "data_type": "byte" so each vector
# component is stored as a signed byte; byte vectors require the Lucene engine.
index_body = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "knn": True,
            "knn.algo_param.ef_search": 100,
        }
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "text_emb": {
                "type": "knn_vector",
                "dimension": 1024,    # 1024 for the large Embed v3 models, 384 for the light ones
                "data_type": "byte",  # store int8 components instead of float32
                "method": {
                    "name": "hnsw",
                    "space_type": "cosinesimil",
                    "engine": "lucene",  # byte vectors are only supported by Lucene
                    "parameters": {
                        "ef_construction": 256,
                        "m": 48,
                    },
                },
            },
        }
    },
}

# Drop any previous copy of the index, then create it fresh.
os_client.indices.delete(index=index_name, ignore=[400, 404])
response = os_client.indices.create(index_name, body=index_body)
print("OpenSearch index created:", response)
# Step 4: Upsert text and embeddings to OpenSearch in batches.
# NOTE(fix): the pasted original had lost the loop-body indentation (an
# IndentationError as captured); restored here as runnable code.
batch_size = 512
doc_id = 0
for start_idx in range(0, len(documents), batch_size):
    batch_documents = documents[start_idx:start_idx + batch_size]

    # Compute int8 document embeddings. Documents must be embedded with
    # input_type="search_document"; embedding_types=["int8"] selects the
    # quantized vectors (4x smaller than float32).
    embeddings = co.embed(
        texts=batch_documents,
        model="embed-english-v3.0",
        input_type="search_document",
        embedding_types=["int8"],
    ).embeddings.int8

    # Build one bulk request per batch and send it in a single round trip.
    batch = []
    for document, doc_emb in zip(batch_documents, embeddings):
        batch.append({
            "_index": index_name,
            "_id": doc_id,
            "_source": {
                "text": document,
                "text_emb": doc_emb,
            },
        })
        doc_id += 1
    helpers.bulk(os_client, batch)

print("Indexing of documents finished")

# Give opensearch some time to index the data
time.sleep(1)
# Step 5: Search the index.
# NOTE(fix): the pasted original had lost the indentation of the final print
# loop (an IndentationError as captured); restored here as runnable code.
query = "Who discovered x-ray?"

# Queries must be embedded with input_type="search_query" (asymmetric to the
# "search_document" type used at indexing time), again requesting int8 vectors.
query_emb = co.embed(
    texts=[query],
    model="embed-english-v3.0",
    input_type="search_query",
    embedding_types=["int8"],
).embeddings.int8[0]

# k-NN query against the byte vector field; return the top_k closest docs.
top_k = 3
query_body = {
    "size": top_k,
    "query": {
        "knn": {
            "text_emb": {
                "vector": query_emb,
                "k": top_k,
            }
        }
    },
}
hits = os_client.search(index=index_name, body=query_body)["hits"]["hits"]

# Print the results
print("Query:", query)
for hit in hits:
    print(hit['_score'], hit['_source']['text'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment