RAG using LangChain and Ollama
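The script below is a single-file Streamlit app that implements the full RAG loop: it scrapes a page with SeleniumURLLoader, splits the text into overlapping chunks, embeds and indexes the chunks in an in-memory vector store via Ollama embeddings, and then answers chat questions by retrieving the most similar chunks and passing them as context to a local llama3.2 model.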
import streamlit as st
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

model_name = "llama3.2"

# Initialize session state: the vector store and the last scraped URL
# persist across Streamlit reruns within the same session.
st.session_state.setdefault("vector_store", InMemoryVectorStore(OllamaEmbeddings(model=model_name)))
st.session_state.setdefault("last_scraped_url", None)

llm = OllamaLLM(model=model_name)

template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)


def load_page(url):
    """Scrape the page at the given URL with a Selenium-driven browser."""
    st.toast("🌐 Scraping content from the URL...")
    loader = SeleniumURLLoader(
        urls=[url],
        executable_path="/opt/homebrew/bin/chromedriver",
    )
    documents = loader.load()
    st.toast("✅ Page content scraped.")
    return documents


def split_text(documents):
    """Split scraped documents into overlapping chunks for embedding."""
    st.toast("🧩 Splitting text into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, add_start_index=True
    )
    doc_list = text_splitter.split_documents(documents)
    st.toast(f"✅ Split into {len(doc_list)} chunks.")
    return doc_list


def index_docs(documents):
    """Embed the chunks and add them to the session's vector store."""
    st.toast("🧠 Updating memory with new content...")
    st.session_state.vector_store.add_documents(documents)
    st.toast("✅ Memory updated.")


def retrieve_docs(query):
    """Return the stored chunks most similar to the query."""
    st.toast("🔍 Searching memory for relevant context...")
    results = st.session_state.vector_store.similarity_search(query)
    st.toast(f"✅ Found {len(results)} relevant pieces of context.")
    return results


def answer_question(question, context):
    """Run the prompt-plus-LLM chain on the question and retrieved context."""
    st.toast("💬 Thinking...")
    chain = prompt | llm
    return chain.invoke({"question": question, "context": context})


# Streamlit UI
st.title("🕷️ AI Web Crawler & Q&A")

url = st.text_input("🔗 Enter a URL to scrape:")

# Only scrape if the URL is new or has changed since the last run
if url and url.strip() and url != st.session_state.last_scraped_url:
    try:
        documents = load_page(url)
        chunked_docs = split_text(documents=documents)
        index_docs(chunked_docs)
        st.session_state.last_scraped_url = url  # Remember the URL to avoid re-scraping
    except Exception as e:
        st.toast(f"❌ Failed to load page: {e}")

question = st.chat_input("💬 Ask a question based on the page content:")
if question:
    st.chat_message("user").write(question)
    retrieved_docs = retrieve_docs(question)
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)
    answer = answer_question(question, context)
    st.chat_message("assistant").write(answer)
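To try it (assuming the packages behind the imports are installed, an Ollama server is running with the model pulled via ollama pull llama3.2, and chromedriver sits at the hard-coded /opt/homebrew/bin/chromedriver path): save the script as, say, app.py and launch it with streamlit run app.py.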