Created
January 20, 2025 22:20
-
-
Save gbaeke/97afb88da56d59e1b6ca460653fc8700 to your computer and use it in GitHub Desktop.
LLM-powered search and answering using the Bing Web Search API with optional web-page scraping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import requests | |
| from fastapi import FastAPI, HTTPException | |
| from pydantic import BaseModel | |
| from bs4 import BeautifulSoup | |
| from openai import OpenAI | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| import uvicorn | |
| from dotenv import load_dotenv | |
# Load environment variables from a local .env file before any config is read.
load_dotenv()

app = FastAPI()

# Configuration — API keys come from the environment; os.getenv returns None
# when unset (requests would then fail at call time, not at import time).
BING_API_KEY = os.getenv("BING_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
BING_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search"

# Initialize models once at import time: the sentence-transformer used for
# chunk ranking and the OpenAI client used for answer generation.
# NOTE(review): SentenceTransformer downloads the model on first run — TODO confirm acceptable at startup.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
client = OpenAI(api_key=OPENAI_API_KEY)
class SearchRequest(BaseModel):
    """Request body for POST /search: the raw query string sent to Bing."""
    query: str
class AnswerRequest(BaseModel):
    """Request body for POST /ask.

    When include_scraped is True, the top search hits are also scraped and
    their most relevant chunks are appended to the LLM context.
    """
    query: str
    include_scraped: bool = False
def bing_search(query: str):
    """Query the Bing Web Search API and return the web-page results.

    Args:
        query: free-text search query.

    Returns:
        A list of ``{"url": ..., "snippet": ...}`` dicts (empty if Bing
        returns no web results).

    Raises:
        HTTPException: 500 with the underlying error on any network/API failure.
    """
    headers = {"Ocp-Apim-Subscription-Key": BING_API_KEY}
    params = {"q": query}
    try:
        # Timeout so a hung upstream call cannot block the worker indefinitely.
        response = requests.get(
            BING_ENDPOINT, headers=headers, params=params, timeout=10
        )
        response.raise_for_status()
        data = response.json()
        # "webPages" is absent from the payload when there are no web results;
        # treat that as an empty result set rather than a KeyError -> 500.
        items = data.get("webPages", {}).get("value", [])
        return [{"url": item["url"], "snippet": item["snippet"]} for item in items]
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Bing search failed: {str(e)}")
def scrape_url(url: str):
    """Fetch a URL and return its visible text, or "" on any failure.

    Best-effort by design: errors are printed and swallowed so one bad page
    cannot fail the whole /ask request.

    Args:
        url: the page to scrape.

    Returns:
        Newline-separated visible text, or "" on any error.
    """
    try:
        response = requests.get(url, timeout=10)
        # Without this, 404/403 error pages would be scraped as if they
        # were real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Remove script/style and common boilerplate containers before
        # extracting text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()
        return soup.get_text(separator='\n', strip=True)
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return ""
def chunk_text(text: str, chunk_size=500, overlap=0):
    """Split text into whitespace-delimited word chunks.

    Args:
        text: input text; splitting is on arbitrary whitespace.
        chunk_size: maximum number of words per chunk.
        overlap: number of words shared between consecutive chunks
            (default 0 keeps the original non-overlapping behavior).

    Returns:
        List of chunk strings; empty list for empty/whitespace-only text.

    Raises:
        ValueError: if overlap is negative or >= chunk_size.
    """
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
    words = text.split()
    # Step < chunk_size when overlapping, so consecutive chunks share words.
    step = chunk_size - overlap
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def get_relevant_chunks(query: str, chunks: list, top_k=3):
    """Return up to top_k chunks most similar to the query, best first.

    Similarity is cosine similarity between sentence-transformer embeddings
    of the query and each chunk.

    Args:
        query: the user question.
        chunks: candidate text chunks.
        top_k: maximum number of chunks to return.

    Returns:
        List of chunk strings ordered by descending similarity; empty list
        when chunks is empty (cosine_similarity would fail on an empty matrix).
    """
    if not chunks:
        return []
    query_embedding = embedding_model.encode([query])
    chunk_embeddings = embedding_model.encode(chunks)
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    # Clamp so argsort slicing behaves sensibly when top_k > len(chunks).
    k = min(top_k, len(chunks))
    top_indices = np.argsort(similarities)[-k:][::-1]
    return [chunks[i] for i in top_indices]
@app.post("/search")
async def search(request: SearchRequest):
    """Return raw Bing web-search results for the submitted query."""
    return {"results": bing_search(request.query)}
def _build_scraped_context(query: str, search_results: list) -> str:
    """Scrape the top 3 results, chunk them, and return query-relevant text.

    Returns "" when nothing useful could be scraped, so the caller can
    append the result unconditionally.
    """
    scraped_data = []
    for result in search_results[:3]:
        content = scrape_url(result["url"])
        if content:
            scraped_data.append(content)
    all_chunks = []
    for content in scraped_data:
        all_chunks.extend(chunk_text(content))
    if not all_chunks:
        return ""
    relevant_chunks = get_relevant_chunks(query, all_chunks)
    return "".join(f"\nRelevant content:\n{chunk}" for chunk in relevant_chunks)


@app.post("/ask")
async def ask_question(request: AnswerRequest):
    """Answer a question using Bing results (optionally enriched by scraping).

    Pipeline: search -> (optional) scrape/chunk/rank -> LLM completion.

    Raises:
        HTTPException: 500 on Bing or OpenAI failures.

    NOTE(review): bing_search/scrape_url/the OpenAI call are synchronous and
    block the event loop inside this async handler — consider async clients
    or a threadpool; left unchanged here to preserve behavior.
    """
    # Step 1: Get search results and seed the context with their snippets.
    search_results = bing_search(request.query)
    context = "Search Results:\n" + "\n".join(
        f"URL: {result['url']}\nSummary: {result['snippet']}\n"
        for result in search_results
    )
    # Steps 2-3: optionally enrich with the most relevant scraped content.
    if request.include_scraped:
        context += _build_scraped_context(request.query, search_results)
    # Step 4: Build LLM prompt.
    prompt = f"""Answer the following question: {request.query}
Context from web search:
{context if context else 'No additional context available'}
Provide a comprehensive answer based on the context and your knowledge. Do not answer if the context is not sufficient."""
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}]
        )
        answer = response.choices[0].message.content.strip()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"OpenAI API error: {str(e)}")
    return {
        "answer": answer,
        "search_results": search_results,
        "used_context": context if request.include_scraped else None,
    }
# Add CORS middleware if needed
# NOTE(review): allow_origins=["*"] permits any browser origin — acceptable
# for a demo, but should be restricted in production.
from fastapi.middleware.cors import CORSMiddleware

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Run a development server when executed directly (not via an ASGI runner).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment