Created September 30, 2024 09:15
import os

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import Chroma

# Step 1: Load PDF Document
def load_pdf(pdf_path):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    return documents

# Step 2: Chunk the Document
def chunk_documents(documents):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    chunks = splitter.split_documents(documents)
    return chunks

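# Optional sketch (not part of the original gist): a small helper to eyeball the
# chunking output, assuming you want to confirm chunk sizes and page metadata
# before spending embedding calls on them.
def preview_chunks(chunks, n=3):
    for i, chunk in enumerate(chunks[:n]):
        print(f"Chunk {i}: {len(chunk.page_content)} chars, metadata={chunk.metadata}")
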
# Step 3: Generate Embeddings in Batches
# The embeddings model is passed in so the same instance can be reused for querying.
def embed_chunks_in_batches(chunks, embeddings_model, batch_size=1):
    embeddings = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        # embed_documents accepts a list of texts, so each batch is embedded in one call
        batch_embeddings = embeddings_model.embed_documents(
            [chunk.page_content for chunk in batch]
        )
        embeddings.extend(batch_embeddings)
    return embeddings

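# Optional sketch (not part of the original workflow): a simple retry loop around a
# batch embedding call, which can help if the Azure endpoint returns transient
# rate-limit errors. The retry count and delay are illustrative assumptions.
import time

def embed_batch_with_retry(embeddings_model, texts, retries=3, delay=5):
    for attempt in range(retries):
        try:
            return embeddings_model.embed_documents(texts)
        except Exception:
            if attempt == retries - 1:
                raise
            time.sleep(delay)
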
# Step 4: Store in Chroma
def store_in_chroma(chunks, embeddings, embeddings_model, collection_name="pdf_chunk_collection"):
    texts = [chunk.page_content for chunk in chunks]
    ids = [f"chunk_{i}" for i in range(len(chunks))]
    chroma_client = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings_model,  # needed so queries can be embedded later
        persist_directory="./chroma_storage"  # Optional: specify a directory to persist the data
    )
    # Add the precomputed embeddings through the underlying chromadb collection;
    # the wrapper's add_texts() would re-embed the texts instead of reusing them.
    chroma_client._collection.add(ids=ids, documents=texts, embeddings=embeddings)
    return chroma_client

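# Optional sketch: reopening the persisted collection in a later run, assuming the
# same persist_directory, collection name, and embedding model are used.
def load_existing_store(embeddings_model, collection_name="pdf_chunk_collection"):
    return Chroma(
        collection_name=collection_name,
        embedding_function=embeddings_model,
        persist_directory="./chroma_storage",
    )
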
# Step 5: Query the Chroma Vector Store
def query_chroma(query_text, vector_store):
    # similarity_search takes k, the number of results to return
    results = vector_store.similarity_search(query_text, k=5)
    return results

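# Optional variant (sketch): retrieve results together with their distance scores
# via similarity_search_with_score, useful if you want to filter out weak matches.
def query_chroma_with_scores(query_text, vector_store, k=5):
    return vector_store.similarity_search_with_score(query_text, k=k)
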
# Step 6: Main Function to Execute the Workflow
def main(pdf_path, query):
    # Load the PDF
    documents = load_pdf(pdf_path)
    # Chunk the documents
    chunks = chunk_documents(documents)
    # Create the embeddings model once and reuse it for indexing and querying
    embeddings_model = AzureOpenAIEmbeddings(
        azure_deployment="cds_text_embedding_2",
        openai_api_version=os.getenv("AZURE_API_Version")
    )
    # Generate embeddings for the chunks
    embeddings = embed_chunks_in_batches(chunks, embeddings_model, batch_size=1)
    # Store chunks and embeddings in Chroma
    vector_store = store_in_chroma(chunks, embeddings, embeddings_model)
    # Query the Chroma vector store
    results = query_chroma(query, vector_store)
    # Display the results
    for i, result in enumerate(results):
        print(f"Result {i + 1}:")
        print(result.page_content)
        print("\n---\n")

# Example Usage
if __name__ == "__main__":
    # Specify the path to your PDF file and your query
    pdf_file_path = "your_pdf_file.pdf"
    user_query = "What is the document about?"
    # Execute the main function
    main(pdf_file_path, user_query)
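
# Setup note (assumption, not from the original gist): LangChain's AzureOpenAIEmbeddings
# typically picks up its endpoint and key from environment variables, in addition to the
# AZURE_API_Version variable read above. The values below are placeholders to fill in:
# os.environ["AZURE_OPENAI_ENDPOINT"] = "https://<your-resource>.openai.azure.com/"
# os.environ["AZURE_OPENAI_API_KEY"] = "<your-api-key>"
# os.environ["AZURE_API_Version"] = "<your-api-version>"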