import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
# Step 1: Load PDF Document
def load_pdf(pdf_path):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    return documents
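# Illustrative usage (the file name is a placeholder): PyPDFLoader returns one
# Document per page, with the page text and source/page metadata attached.
#   docs = load_pdf("report.pdf")
#   docs[0].page_content   # text of the first page
#   docs[0].metadata       # e.g. {"source": "report.pdf", "page": 0}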
# Step 2: Chunk the Document
def chunk_documents(documents):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    chunks = splitter.split_documents(documents)
    return chunks
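# Note: RecursiveCharacterTextSplitter counts chunk_size and chunk_overlap in
# characters by default, so each chunk is at most ~1000 characters with a
# 50-character overlap between neighbours. Quick sanity check (illustrative):
#   max(len(c.page_content) for c in chunk_documents(docs))  # <= 1000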
# Step 3: Generate Embeddings One at a Time
def embed_chunks(chunks):
    embeddings_model = AzureOpenAIEmbeddings(
        azure_deployment="cds_text_embedding_2",
        openai_api_version=os.getenv("AZURE_API_Version")
    )
    texts = [chunk.page_content for chunk in chunks]
    embeddings = []
    for text in texts:
        # Embed one text at a time to avoid the "Too many inputs" error
        embedding = embeddings_model.embed_documents([text])[0]
        embeddings.append(embedding)
    # Also return the model so the vector store can embed queries at retrieval time
    return texts, embeddings, embeddings_model
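# If your Azure deployment accepts small batches, a batched variant (a sketch,
# not part of the original gist; the batch size of 16 is arbitrary) reduces
# the number of API calls while staying under the input limit:
#   for i in range(0, len(texts), 16):
#       embeddings.extend(embeddings_model.embed_documents(texts[i:i + 16]))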
# Step 4: Store Chunks and Embeddings in Chroma
def store_in_chroma(texts, embeddings, embeddings_model, collection_name="pdf_chunk_collection"):
    chroma_client = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings_model,  # needed so the retriever can embed queries later
        persist_directory="./chroma_storage"  # Optional: specify a directory to persist the data
    )
    # Chroma.add_texts has no parameter for precomputed embeddings and would
    # re-embed the texts, so add them through the underlying chromadb collection.
    chroma_client._collection.add(
        ids=[str(i) for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
    )
    return chroma_client
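# With persist_directory set, recent chromadb releases write to disk on their
# own; older LangChain/chromadb combinations may need an explicit
# chroma_client.persist() call (version-dependent, verify locally). Reopening
# the same store later looks like this (illustrative):
#   store = Chroma(collection_name="pdf_chunk_collection",
#                  embedding_function=embeddings_model,
#                  persist_directory="./chroma_storage")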
# Step 5: Set Up the Azure OpenAI for Chat with the Chroma Store
def setup_qa_with_pdf(chroma_vector_store):
    # gpt-4 is a chat-completions model, so use the chat wrapper rather than
    # the completions-only AzureOpenAI class
    llm = AzureChatOpenAI(
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        deployment_name="gpt-4",
        openai_api_version=os.getenv("AZURE_API_Version")
    )
    retriever = chroma_vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever
    )
    return qa_chain
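# as_retriever() defaults to returning the top 4 matching chunks; pass
# search_kwargs to trade recall against prompt size (illustrative value):
#   retriever = chroma_vector_store.as_retriever(search_kwargs={"k": 2})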
# Step 6: Main Function to Execute the Chat with PDF Workflow
def main(pdf_path, query):
    # Step 1: Load the PDF
    documents = load_pdf(pdf_path)
    # Step 2: Chunk the documents
    chunks = chunk_documents(documents)
    # Step 3: Generate embeddings for the chunks one at a time
    texts, embeddings, embeddings_model = embed_chunks(chunks)
    # Step 4: Store chunks and embeddings in Chroma
    vector_store = store_in_chroma(texts, embeddings, embeddings_model)
    # Step 5: Set up the QA chain
    qa_chain = setup_qa_with_pdf(vector_store)
    # Step 6: Handle user query
    answer = qa_chain.run(query)
    # Print or return the answer
    print(f"Answer: {answer}")
# Example Usage
if __name__ == "__main__":
    # Specify the path to your PDF file and your query
    pdf_file_path = "your_pdf_file.pdf"
    user_query = "What is the document about?"
    # Execute the main function
    main(pdf_file_path, user_query)
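# Environment assumed by the calls above (names taken from the os.getenv
# usage; the Azure classes also read the endpoint from AZURE_OPENAI_ENDPOINT
# when it is not passed explicitly):
#   export AZURE_OPENAI_API_KEY="<your-key>"
#   export AZURE_OPENAI_ENDPOINT="https://<your-resource>.openai.azure.com/"
#   export AZURE_API_Version="<api-version>"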