LLM Search with Bing and scraping
import os
import requests
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from bs4 import BeautifulSoup
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import uvicorn
from dotenv import load_dotenv
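# Dependencies are not pinned in the gist; based on the imports above,
# something like this should work:
#   pip install fastapi uvicorn requests beautifulsoup4 openai \
#       sentence-transformers scikit-learn numpy python-dotenv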
# Load environment variables from .env file
load_dotenv()

app = FastAPI()

# Configuration
BING_API_KEY = os.getenv("BING_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
BING_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search"

# Initialize models
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
client = OpenAI(api_key=OPENAI_API_KEY)
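# Optional: fail fast when either key is missing; otherwise the first request
# fails with a less obvious 500 instead of an error at startup.
if not BING_API_KEY or not OPENAI_API_KEY:
    raise RuntimeError("BING_API_KEY and OPENAI_API_KEY must be set (see .env)")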
class SearchRequest(BaseModel):
    query: str

class AnswerRequest(BaseModel):
    query: str
    include_scraped: bool = False
def bing_search(query: str):
    headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
    params = {"q": query}
    try:
        response = requests.get(BING_ENDPOINT, headers=headers, params=params)
        response.raise_for_status()
        data = response.json()
        # Guard against responses without web results; a bare KeyError here
        # would surface as a confusing 500
        web_pages = data.get("webPages", {}).get("value", [])
        return [{"url": item["url"], "snippet": item["snippet"]} for item in web_pages]
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Bing search failed: {str(e)}")
def scrape_url(url: str):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # don't treat error pages as content
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove script, style, and navigation elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        text = soup.get_text(separator='\n', strip=True)
        return text
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return ""
def chunk_text(text: str, chunk_size=500):
    # Fixed-size word windows: chunk_size=2 turns "a b c d e" into ["a b", "c d", "e"]
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks
def get_relevant_chunks(query: str, chunks: list, top_k=3):
    query_embedding = embedding_model.encode([query])
    chunk_embeddings = embedding_model.encode(chunks)
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    return [chunks[i] for i in top_indices]
@app.post("/search")
async def search(request: SearchRequest):
results = bing_search(request.query)
return {"results": results}
@app.post("/ask")
async def ask_question(request: AnswerRequest):
# Step 1: Get search results
search_results = bing_search(request.query)
# Initialize context with search results
context = "Search Results:\n" + "\n".join([f"URL: {result['url']}\nSummary: {result['snippet']}\n" for result in search_results])
if request.include_scraped:
# Step 2: Scrape and process top 3 results
scraped_data = []
for result in search_results[:3]:
content = scrape_url(result["url"])
if content:
scraped_data.append(content)
# Step 3: Chunk and find relevant content
all_chunks = []
for content in scraped_data:
all_chunks.extend(chunk_text(content))
if all_chunks:
relevant_chunks = get_relevant_chunks(request.query, all_chunks)
# Append relevant chunks to context
for chunk in relevant_chunks:
context += f"\nRelevant content:\n{chunk}"
# Step 4: Build LLM prompt
prompt = f"""Answer the following question: {request.query}
Context from web search:
{context if context else 'No additional context available'}
Provide a comprehensive answer based on the context and your knowledge. Do not answer if the context is not sufficient."""
try:
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}]
)
answer = response.choices[0].message.content.strip()
except Exception as e:
raise HTTPException(status_code=500, detail=f"OpenAI API error: {str(e)}")
return {
"answer": answer,
"search_results": search_results,
"used_context": context if request.include_scraped else None
}
# Add CORS middleware if needed
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
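# Example requests, assuming the server is running locally on port 8000
# (the query strings are placeholders):
#   curl -X POST http://localhost:8000/search \
#       -H "Content-Type: application/json" \
#       -d '{"query": "what is retrieval augmented generation"}'
#   curl -X POST http://localhost:8000/ask \
#       -H "Content-Type: application/json" \
#       -d '{"query": "what is retrieval augmented generation", "include_scraped": true}'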