RAG using LangChain and Ollama
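The script below is a single-file Streamlit app that implements the full RAG loop: it scrapes a page with SeleniumURLLoader, splits the text into overlapping chunks, embeds and indexes the chunks in an in-memory vector store via Ollama embeddings, and then answers chat questions by retrieving the most similar chunks and passing them as context to a local llama3.2 model.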
import streamlit as st
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

model_name = "llama3.2"

# Initialize session state: the vector store and the last scraped URL
# persist across Streamlit reruns within the same session.
st.session_state.setdefault("vector_store", InMemoryVectorStore(OllamaEmbeddings(model=model_name)))
st.session_state.setdefault("last_scraped_url", None)

llm = OllamaLLM(model=model_name)

template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)


def load_page(url):
    """Scrape the page at the given URL with a Selenium-driven browser."""
    st.toast("🌐 Scraping content from the URL...")
    loader = SeleniumURLLoader(
        urls=[url],
        executable_path="/opt/homebrew/bin/chromedriver",
    )
    documents = loader.load()
    st.toast("✅ Page content scraped.")
    return documents


def split_text(documents):
    """Split scraped documents into overlapping chunks for embedding."""
    st.toast("🧩 Splitting text into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, add_start_index=True
    )
    doc_list = text_splitter.split_documents(documents)
    st.toast(f"✅ Split into {len(doc_list)} chunks.")
    return doc_list


def index_docs(documents):
    """Embed the chunks and add them to the session's vector store."""
    st.toast("🧠 Updating memory with new content...")
    st.session_state.vector_store.add_documents(documents)
    st.toast("✅ Memory updated.")


def retrieve_docs(query):
    """Return the stored chunks most similar to the query."""
    st.toast("🔍 Searching memory for relevant context...")
    results = st.session_state.vector_store.similarity_search(query)
    st.toast(f"✅ Found {len(results)} relevant pieces of context.")
    return results


def answer_question(question, context):
    """Run the prompt-plus-LLM chain on the question and retrieved context."""
    st.toast("💬 Thinking...")
    chain = prompt | llm
    return chain.invoke({"question": question, "context": context})


# Streamlit UI
st.title("🕷️ AI Web Crawler & Q&A")

url = st.text_input("🔗 Enter a URL to scrape:")

# Only scrape if the URL is new or has changed since the last run
if url and url.strip() and url != st.session_state.last_scraped_url:
    try:
        documents = load_page(url)
        chunked_docs = split_text(documents=documents)
        index_docs(chunked_docs)
        st.session_state.last_scraped_url = url  # Remember the URL to avoid re-scraping
    except Exception as e:
        st.toast(f"❌ Failed to load page: {e}")

question = st.chat_input("💬 Ask a question based on the page content:")
if question:
    st.chat_message("user").write(question)
    retrieved_docs = retrieve_docs(question)
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)
    answer = answer_question(question, context)
    st.chat_message("assistant").write(answer)
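To try it (assuming the packages behind the imports are installed, an Ollama server is running with the model pulled via ollama pull llama3.2, and chromedriver sits at the hard-coded /opt/homebrew/bin/chromedriver path): save the script as, say, app.py and launch it with streamlit run app.py.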