import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
# Step 1: Load PDF Document
def load_pdf(pdf_path):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    return documents
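# Illustrative usage (the file name is a placeholder): PyPDFLoader returns one
# Document per page, with the page text and source/page metadata attached.
#   docs = load_pdf("report.pdf")
#   docs[0].page_content   # text of the first page
#   docs[0].metadata       # e.g. {"source": "report.pdf", "page": 0}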
# Step 2: Chunk the Document
def chunk_documents(documents):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    chunks = splitter.split_documents(documents)
    return chunks
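# Note: RecursiveCharacterTextSplitter counts chunk_size and chunk_overlap in
# characters by default, so each chunk is at most ~1000 characters with a
# 50-character overlap between neighbours. Quick sanity check (illustrative):
#   max(len(c.page_content) for c in chunk_documents(docs))  # <= 1000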
# Step 3: Generate Embeddings One at a Time
def embed_chunks(chunks):
    embeddings_model = AzureOpenAIEmbeddings(
        azure_deployment="cds_text_embedding_2",
        openai_api_version=os.getenv("AZURE_API_Version")
    )
    texts = [chunk.page_content for chunk in chunks]
    embeddings = []
    for text in texts:
        # Embed one text at a time to avoid the "Too many inputs" error
        embedding = embeddings_model.embed_documents([text])[0]
        embeddings.append(embedding)
    # Also return the model so the vector store can embed queries at retrieval time
    return texts, embeddings, embeddings_model
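# If your Azure deployment accepts small batches, a batched variant (a sketch,
# not part of the original gist; the batch size of 16 is arbitrary) reduces
# the number of API calls while staying under the input limit:
#   for i in range(0, len(texts), 16):
#       embeddings.extend(embeddings_model.embed_documents(texts[i:i + 16]))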
# Step 4: Store Chunks and Embeddings in Chroma
def store_in_chroma(texts, embeddings, embeddings_model, collection_name="pdf_chunk_collection"):
    chroma_client = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings_model,  # needed so the retriever can embed queries later
        persist_directory="./chroma_storage"  # Optional: specify a directory to persist the data
    )
    # Chroma.add_texts has no parameter for precomputed embeddings and would
    # re-embed the texts, so add them through the underlying chromadb collection.
    chroma_client._collection.add(
        ids=[str(i) for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
    )
    return chroma_client
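# With persist_directory set, recent chromadb releases write to disk on their
# own; older LangChain/chromadb combinations may need an explicit
# chroma_client.persist() call (version-dependent, verify locally). Reopening
# the same store later looks like this (illustrative):
#   store = Chroma(collection_name="pdf_chunk_collection",
#                  embedding_function=embeddings_model,
#                  persist_directory="./chroma_storage")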
# Step 5: Set Up the Azure OpenAI for Chat with the Chroma Store
def setup_qa_with_pdf(chroma_vector_store):
    # gpt-4 is a chat-completions model, so use the chat wrapper rather than
    # the completions-only AzureOpenAI class
    llm = AzureChatOpenAI(
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        deployment_name="gpt-4",
        openai_api_version=os.getenv("AZURE_API_Version")
    )
    retriever = chroma_vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever
    )
    return qa_chain
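# as_retriever() defaults to returning the top 4 matching chunks; pass
# search_kwargs to trade recall against prompt size (illustrative value):
#   retriever = chroma_vector_store.as_retriever(search_kwargs={"k": 2})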
# Step 6: Main Function to Execute the Chat with PDF Workflow
def main(pdf_path, query):
    # Step 1: Load the PDF
    documents = load_pdf(pdf_path)
    # Step 2: Chunk the documents
    chunks = chunk_documents(documents)
    # Step 3: Generate embeddings for the chunks one at a time
    texts, embeddings, embeddings_model = embed_chunks(chunks)
    # Step 4: Store chunks and embeddings in Chroma
    vector_store = store_in_chroma(texts, embeddings, embeddings_model)
    # Step 5: Set up the QA chain
    qa_chain = setup_qa_with_pdf(vector_store)
    # Step 6: Handle user query
    answer = qa_chain.run(query)
    # Print or return the answer
    print(f"Answer: {answer}")
# Example Usage
if __name__ == "__main__":
    # Specify the path to your PDF file and your query
    pdf_file_path = "your_pdf_file.pdf"
    user_query = "What is the document about?"
    # Execute the main function
    main(pdf_file_path, user_query)
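# Environment assumed by the calls above (names taken from the os.getenv
# usage; the Azure classes also read the endpoint from AZURE_OPENAI_ENDPOINT
# when it is not passed explicitly):
#   export AZURE_OPENAI_API_KEY="<your-key>"
#   export AZURE_OPENAI_ENDPOINT="https://<your-resource>.openai.azure.com/"
#   export AZURE_API_Version="<api-version>"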