Scraping HN articles of interest and then using an LLM to get their topics
import json
import os
import urllib.request

import requests
from anthropic import Anthropic
from openai import OpenAI

from data_models import AIResponseList  # Pydantic schema for the structured LLM response
from strings import *  # prompt builder and Hacker News URL helpers
from keys import *  # API keys
# The name `anthropic` (an API key from keys.py) is rebound to the client here;
# the Anthropic client is created but never used below
anthropic = Anthropic(api_key=anthropic)
openai_k = openai_key

# Fetch the list of latest story IDs from the Hacker News API
url = hacker_news_latest_url
data = urllib.request.urlopen(url)
data = json.loads(data.read().decode("utf-8"))
# Load URLs that were analysed on a previous run so they are not processed twice
existing_urls = set()
if os.path.exists("topic_analysis.json"):
    with open("topic_analysis.json", "r") as json_file:
        existing_data = json.load(json_file)
    if "responses" in existing_data:
        existing_urls = {item["url"] for item in existing_data["responses"]}
| print(f"Before = {len(data)}") | |
| d_ict = {"name": [], "url": []} | |
| for i in list(data)[1:10]: | |
| url = f"{hacker_news_get_article_url(i)} | |
| data = urllib.request.urlopen(url) | |
| json_data = json.loads(data.read().decode("utf-8")) | |
| d_ict["name"].append(json_data.get("title", "")) | |
| d_ict["url"].append(json_data.get("url", "")) | |
try:
    # Download the first chunk of each new article's HTML
    content_full = []
    try:
        for name, url in zip(d_ict["name"], d_ict["url"]):
            if url not in existing_urls:
                content_d = {}
                print(f"Processing {url}")
                response = requests.get(url)
                content = response.text[1:1000]  # keep only the first ~1,000 characters
                content_d["name"] = name
                content_d["url"] = url
                content_d["content"] = content
                content_full.append(content_d)
    except Exception as e:
        # Note: a single failed fetch skips all remaining URLs, since the
        # handler sits outside the loop
        print(f"Skipping {url}: {e}")

    print(f"Total URLs to process - {len(content_full)}")
    response_model = AIResponseList
    messages = [
        {
            "role": "user",
            "content": f"{get_sys_prompt(content_full)}",
        }
    ]
    client = OpenAI(api_key=openai_k)
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06", messages=messages, response_format=response_model
    )
    # The response will now be forced to be either "Python" or "GenAI"
    topic = completion.choices[0].message.parsed
    if topic:
        # Convert the topic object to a dictionary
        topic_dict = topic.model_dump()
        # Convert the dictionary to JSON
        topic_json = json.dumps(topic_dict, indent=2)
        json_filename = "topic_analysis.json"
        # Read existing JSON data from the file
        existing_data = {}
        if os.path.exists(json_filename):
            with open(json_filename, "r") as json_file:
                existing_data = json.load(json_file)
        # Filter out irrelevant topics
        new_responses = [
            response
            for response in topic_dict["responses"]
            if response["topic"].lower() != "irrelevant"
        ]
        topic_dict["responses"] = new_responses
        # Merge existing data with new data
        if "responses" in existing_data:
            existing_data["responses"].extend(topic_dict["responses"])
        else:
            existing_data = topic_dict
        # Convert the merged data back to JSON
        topic_json = json.dumps(existing_data, indent=2)
        # Write the merged data to the JSON file
        with open(json_filename, "w") as json_file:
            json_file.write(topic_json)
        print(f"JSON output written to {json_filename}")
        print(f"The content is:\n {topic}")
except Exception as e:
    print(f"Error processing {url}: {str(e)}")