import os

# Note: on newer LangChain releases these classes live in the
# langchain_community / langchain_openai / langchain_chroma packages.
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
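
# The Azure OpenAI clients below read their connection settings from the
# environment. This script assumes at least the following variables are set;
# AZURE_API_Version (and the deployment names used later) are this example's
# own choices, not names fixed by the libraries:
#
#   AZURE_OPENAI_API_KEY   - API key for the Azure OpenAI resource
#   AZURE_OPENAI_ENDPOINT  - e.g. https://<resource-name>.openai.azure.com/
#   AZURE_API_Version      - the API version string, e.g. "2023-05-15"
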
# Step 1: Load the PDF document
def load_pdf(pdf_path):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    return documents

# Step 2: Chunk the document
def chunk_documents(documents):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    chunks = splitter.split_documents(documents)
    return chunks
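
# To sanity-check the chunking parameters before spending embedding calls, the
# splitter can also be run on a plain string (illustrative only; the file name
# is a placeholder):
#
#   splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
#   for piece in splitter.split_text(open("some_file.txt").read()):
#       print(len(piece), repr(piece[:60]))
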
# Step 3: Generate embeddings one at a time
def embed_chunks(chunks):
    embeddings_model = AzureOpenAIEmbeddings(
        azure_deployment="cds_text_embedding_2",
        openai_api_version=os.getenv("AZURE_API_Version")
    )
    texts = [chunk.page_content for chunk in chunks]
    embeddings = []
    for text in texts:
        # Embed one text per request to avoid Azure's "Too many inputs" error
        embedding = embeddings_model.embed_documents([text])[0]
        embeddings.append(embedding)
    # Also return the model: the vector store needs it to embed queries later
    return texts, embeddings, embeddings_model
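
# An alternative to the manual loop above: the OpenAI embedding classes take a
# chunk_size parameter that caps how many inputs go into each request, so the
# batch size can be limited at construction time instead. A sketch, with the
# same deployment name assumed:
#
#   embeddings_model = AzureOpenAIEmbeddings(
#       azure_deployment="cds_text_embedding_2",
#       openai_api_version=os.getenv("AZURE_API_Version"),
#       chunk_size=1,  # one input per embedding request
#   )
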
# Step 4: Store chunks and embeddings in Chroma
def store_in_chroma(texts, embeddings, embeddings_model, collection_name="pdf_chunk_collection"):
    chroma_client = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings_model,  # needed to embed queries at retrieval time
        persist_directory="./chroma_storage"  # Optional: directory to persist the data
    )
    # add_texts() would re-embed every chunk, so write the vectors computed in
    # Step 3 through the wrapper's internal chromadb collection instead
    chroma_client._collection.add(
        ids=[str(i) for i in range(len(texts))],
        documents=texts,
        embeddings=embeddings,
    )
    return chroma_client
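
# The store can be sanity-checked without the QA chain by querying the vector
# store directly. A minimal sketch:
#
#   store = store_in_chroma(texts, embeddings, embeddings_model)
#   for doc in store.similarity_search("What is the document about?", k=2):
#       print(doc.page_content[:100])
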
# Step 5: Set up Azure OpenAI for chat with the Chroma store
def setup_qa_with_pdf(chroma_vector_store):
    # gpt-4 deployments are chat models, so use AzureChatOpenAI rather than
    # the completions-only AzureOpenAI wrapper
    llm = AzureChatOpenAI(
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        deployment_name="gpt-4",
        openai_api_version=os.getenv("AZURE_API_Version")
    )
    retriever = chroma_vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever
    )
    return qa_chain
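
# The retrieval step can be tuned: as_retriever() accepts search_kwargs to
# control how many chunks get stuffed into the prompt, and RetrievalQA can
# return the retrieved chunks for inspection. A sketch:
#
#   retriever = chroma_vector_store.as_retriever(search_kwargs={"k": 4})
#   qa_chain = RetrievalQA.from_chain_type(
#       llm=llm,
#       chain_type="stuff",
#       retriever=retriever,
#       return_source_documents=True,
#   )
#   result = qa_chain({"query": "..."})  # .run() only supports single-output chains
#   print(result["result"], result["source_documents"])
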
# Step 6: Main function to execute the chat-with-PDF workflow
def main(pdf_path, query):
    # Step 1: Load the PDF
    documents = load_pdf(pdf_path)
    # Step 2: Chunk the documents
    chunks = chunk_documents(documents)
    # Step 3: Generate embeddings for the chunks one at a time
    texts, embeddings, embeddings_model = embed_chunks(chunks)
    # Step 4: Store chunks and embeddings in Chroma
    vector_store = store_in_chroma(texts, embeddings, embeddings_model)
    # Step 5: Set up the QA chain
    qa_chain = setup_qa_with_pdf(vector_store)
    # Step 6: Answer the user's query
    answer = qa_chain.run(query)
    print(f"Answer: {answer}")

# Example usage
if __name__ == "__main__":
    # Specify the path to your PDF file and your query
    pdf_file_path = "your_pdf_file.pdf"
    user_query = "What is the document about?"
    # Execute the main function
    main(pdf_file_path, user_query)