@shencan
Forked from jasonforte/.env
Created November 16, 2025 12:49
Example Python code for interacting with Amazon S3 Vectors (boto3 & langchain)
.env

REGION_NAME = "us-east-1"
AWS_DEFAULT_PROFILE = "demo"
MODEL_ID = "amazon.titan-embed-text-v2:0"
S3_VECTOR_BUCKET_NAME = ""
S3_VECTOR_INDEX_NAME = ""
INPUT_DATASET_PATH = "dataset/occurrences.csv"
MAX_INPUT_DOCUMENTS = "200"
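
Note that S3_VECTOR_BUCKET_NAME and S3_VECTOR_INDEX_NAME are left blank: the scripts assume the vector bucket and index already exist. If you still need to create them, a minimal boto3 sketch would look like the following (bucket and index names are placeholders; dimension 1024 is the default output size of amazon.titan-embed-text-v2:0):

import boto3

s3vectors = boto3.client('s3vectors', region_name='us-east-1')
s3vectors.create_vector_bucket(vectorBucketName='my-vector-bucket')  # placeholder name
s3vectors.create_index(
    vectorBucketName='my-vector-bucket',
    indexName='my-index',  # placeholder name
    dataType='float32',
    dimension=1024,        # matches titan-embed-text-v2's default embedding size
    distanceMetric='cosine',
)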

Example - Using Amazon S3 Vectors (Boto3 & LangChain)

This gist is a code snippet prepared as part of a series of YouTube videos by MakeOps.

To watch the content, head to this YouTube Video Index.

Dependencies

In order to run these examples you'll need to install the following Python dependencies:

dependencies = [
    "boto3>=1.40.11",
    "dotenv>=0.9.9",
    "langchain>=0.3.27",
    "langchain-aws==0.2.30",
    "langchain-community>=0.3.27",
]
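
If you're not managing these through a pyproject.toml, the same pins can be installed directly, e.g.:

pip install "boto3>=1.40.11" "dotenv>=0.9.9" "langchain>=0.3.27" "langchain-aws==0.2.30" "langchain-community>=0.3.27"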

With Boto3

This file uses only the boto3 AWS SDK to perform operations against the S3 vector store.

With LangChain

This file uses only LangChain dependencies to perform operations on the S3 vector store.

Example Dataset

The dataset used in the example is the UK Marine Accident Investigation Branch occurrences dataset.
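
Both scripts assume the dataset is a semicolon-delimited CSV whose columns include Occurrence_Id, Description, Local_Date_Main_Event (an ISO date) and Coastal_State_Affected. A quick sanity check on a local copy, using the path from the .env template above, might be:

import csv

with open("dataset/occurrences.csv", newline="") as fp:
    reader = csv.DictReader(fp, delimiter=";")
    first = next(reader)
    print(sorted(first))  # confirm the expected column names are present
    print(first["Occurrence_Id"], first["Local_Date_Main_Event"])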

Resources

More Info

Reach out to our team: MakeOps Website

import os
import csv
import json
from datetime import date

import boto3
from dotenv import load_dotenv

load_dotenv()

delimiter = ";"
region_name = os.environ.get('REGION_NAME', 'us-east-1')
model_id = os.environ.get('MODEL_ID')
vector_bucket_name = os.environ.get('S3_VECTOR_BUCKET_NAME')
index_name = os.environ.get('S3_VECTOR_INDEX_NAME')
dataset_path = os.environ.get('INPUT_DATASET_PATH')
max_input_documents = int(os.environ.get('MAX_INPUT_DOCUMENTS', '-1'))

bedrock = boto3.client('bedrock-runtime', region_name=region_name)
s3vectors = boto3.client('s3vectors', region_name=region_name)


def read_from_csv(file_path, limit=-1):
    '''Yield rows from the semicolon-delimited CSV dataset file.'''
    count = 0
    with open(file_path, 'r') as fp:
        # The first line holds the column names; strip the trailing newline
        # so the last field name comes out clean.
        columns = fp.readline().strip()
        reader = csv.DictReader(fp, fieldnames=columns.split(delimiter), delimiter=delimiter)
        for row in reader:
            yield row
            count += 1
            if limit != -1 and count >= limit:
                break


def create_embedding(text):
    '''Use Bedrock to get the embedding vector for the text.'''
    response = bedrock.invoke_model(
        modelId=model_id,
        body=json.dumps({"inputText": text})
    )
    response_body = json.loads(response['body'].read())
    return response_body["embedding"]


def create_vector(doc):
    '''Build an S3 Vectors record (key, embedding, metadata) from a CSV row.'''
    embedding = create_embedding(doc['Description'])
    local_date = date.fromisoformat(doc['Local_Date_Main_Event'])
    return {
        "key": doc['Occurrence_Id'],
        "data": {"float32": embedding},
        "metadata": {
            "occurrence_id": doc['Occurrence_Id'],
            "local_date": doc['Local_Date_Main_Event'],
            "local_date_year": local_date.year,
            "local_date_month": local_date.month,
            "state_affected": doc['Coastal_State_Affected'],
            "source_text": doc['Description'],
        }
    }


def load_documents(file_path):
    '''Load documents into the vector store up to a limit.'''
    vectors = [create_vector(doc) for doc in read_from_csv(file_path, limit=max_input_documents)]
    s3vectors.put_vectors(
        vectorBucketName=vector_bucket_name,
        indexName=index_name,
        vectors=vectors
    )


def list_vectors():
    '''List the vectors in the index.'''
    return s3vectors.list_vectors(
        vectorBucketName=vector_bucket_name,
        indexName=index_name
    )['vectors']


def query_documents(query, top_k=3):
    '''Query the vector store.'''
    query_embedding = create_embedding(query)
    response = s3vectors.query_vectors(
        vectorBucketName=vector_bucket_name,
        indexName=index_name,
        queryVector={"float32": query_embedding},
        topK=top_k,
        returnDistance=True,
        returnMetadata=True
    )
    return response["vectors"]


def query_documents_with_filter(query, year, month, top_k=3):
    '''Query the vector store with a metadata filter on year and month.'''
    query_embedding = create_embedding(query)
    response = s3vectors.query_vectors(
        vectorBucketName=vector_bucket_name,
        indexName=index_name,
        queryVector={"float32": query_embedding},
        topK=top_k,
        filter={
            "$and": [
                {"local_date_year": {"$eq": year}},
                {"local_date_month": {"$eq": month}}
            ]
        },
        returnDistance=True,
        returnMetadata=True
    )
    return response["vectors"]


def delete_vectors():
    '''Delete all vectors by id.'''
    docs = read_from_csv(dataset_path, limit=max_input_documents)
    return s3vectors.delete_vectors(
        vectorBucketName=vector_bucket_name,
        indexName=index_name,
        keys=[doc['Occurrence_Id'] for doc in docs]
    )
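
For reference, a minimal driver for these helpers, assuming the .env above is filled in and the index exists (the query string is purely illustrative):

if __name__ == '__main__':
    load_documents(dataset_path)  # embed and upload up to MAX_INPUT_DOCUMENTS rows
    print(f"index now holds {len(list_vectors())} vectors")
    for match in query_documents("vessel collision in poor visibility"):
        # each match carries its key plus the distance and metadata we requested
        print(match["key"], match["distance"], match["metadata"]["local_date"])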
import os
import csv
from datetime import date

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_aws import BedrockEmbeddings
from langchain_aws.vectorstores.s3_vectors import AmazonS3Vectors

load_dotenv()

delimiter = ';'
region_name = os.environ.get('REGION_NAME', 'us-east-1')
model_id = os.environ.get('MODEL_ID')
vector_bucket_name = os.environ.get('S3_VECTOR_BUCKET_NAME')
index_name = os.environ.get('S3_VECTOR_INDEX_NAME')
dataset_path = os.environ.get('INPUT_DATASET_PATH')
max_input_documents = int(os.environ.get('MAX_INPUT_DOCUMENTS', '-1'))

embedding = BedrockEmbeddings(model_id=model_id, region_name=region_name)
vectorstore = AmazonS3Vectors(
    vector_bucket_name=vector_bucket_name,
    index_name=index_name,
    embedding=embedding,
    region_name=region_name
)


def read_from_csv(file_path, limit=-1):
    '''Yield rows from the semicolon-delimited CSV dataset file.'''
    count = 0
    with open(file_path, 'r') as fp:
        # The first line holds the column names; strip the trailing newline
        # so the last field name comes out clean.
        columns = fp.readline().strip()
        reader = csv.DictReader(fp, fieldnames=columns.split(delimiter), delimiter=delimiter)
        for row in reader:
            yield row
            count += 1
            if limit != -1 and count >= limit:
                break


def create_document(doc):
    '''Create a LangChain document for insertion into the vector store.'''
    page_content = doc['Description']
    local_date = date.fromisoformat(doc['Local_Date_Main_Event'])
    metadata = {
        "occurrence_id": doc['Occurrence_Id'],
        "local_date": doc['Local_Date_Main_Event'],
        "local_date_year": local_date.year,
        "local_date_month": local_date.month,
        "state_affected": doc['Coastal_State_Affected'],
        "source_text": doc['Description'],
    }
    return Document(
        id=doc['Occurrence_Id'],
        page_content=page_content,
        metadata=metadata
    )


def load_documents(file_path):
    '''Load documents into the vector store up to a limit.'''
    documents = [create_document(doc) for doc in read_from_csv(file_path, limit=max_input_documents)]
    vectorstore.add_documents(documents)


def query_documents(query, top_k=3):
    '''Query the vector store.'''
    return vectorstore.similarity_search_with_score(query, k=top_k)


def query_documents_with_filter(query, year, month, top_k=3):
    '''Query the vector store with a metadata filter on year and month.'''
    return vectorstore.similarity_search_with_score(query, k=top_k, filters={
        "$and": [
            {"local_date_year": {"$eq": year}},
            {"local_date_month": {"$eq": month}}
        ]
    })


def delete_vectors():
    '''Delete all vectors by id.'''
    docs = read_from_csv(dataset_path, limit=max_input_documents)
    return vectorstore.delete(
        ids=[doc['Occurrence_Id'] for doc in docs]
    )
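
A similar minimal driver for the LangChain variant, under the same assumptions; similarity_search_with_score returns (Document, score) pairs:

if __name__ == '__main__':
    load_documents(dataset_path)
    for doc, score in query_documents("engine failure near a harbour entrance"):
        print(round(score, 4), doc.metadata["occurrence_id"], doc.page_content[:80])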