Created: August 8, 2025 08:52
Python script to generate a summary of a Hacker News post using one or more LLMs by calling OpenAI API-compatible endpoints.
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "openai==1.99.3",
#     "python-dotenv==1.1.1",
# ]
# ///
import argparse
import json
import os
import re
import sys
import textwrap
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

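# Fetch the post and its full comment tree from the Algolia HN API; returns None on any error.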
def fetch_post_data(post_id):
    url = f"https://hn.algolia.com/api/v1/items/{post_id}"
    try:
        with urllib.request.urlopen(url) as response:
            data = response.read()
            return json.loads(data)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            print(f"Error: Post with ID {post_id} not found")
        else:
            print(f"HTTP Error {e.code}: {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"Network error: {e.reason}")
        return None
    except json.JSONDecodeError:
        print("Error: Invalid JSON response from API")
        return None

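# Convert HN comment HTML to plain text: drop tags and unescape common HTML entities.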
def clean_html(text):
    if text is None:
        return ""
    text = re.sub(r"<p>", "\n\n", text)
    text = re.sub(r"</p>", "", text)
    text = re.sub(r"<br\s*/?>", "\n", text)
    text = re.sub(r"<pre><code>(.*?)</code></pre>", r"\1", text, flags=re.DOTALL)
    text = re.sub(r"<[^>]+>", "", text)
">
    text = text.replace("&gt;", ">")
    text = text.replace("&lt;", "<")
    text = text.replace("&amp;", "&")
    text = text.replace("&quot;", '"')
    text = text.replace("&#x27;", "'")
    text = text.replace("&#x2F;", "/")
    text = text.replace("&nbsp;", " ")
    text = re.sub(r"\n\n+", "\n\n", text)
    return text.strip()

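# Walk the item's comment tree depth-first, collecting author, cleaned text, and depth for each comment.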
def extract_comments_recursive(item, depth=0, comments_list=None):
    if comments_list is None:
        comments_list = []
    if item is None:
        return comments_list
    author = item.get("author")
    text = item.get("text")
    if author or text:
        author = author if author else "[deleted]"
        text = clean_html(text) if text else "[removed]"
        if not (author == "[deleted]" and text == "[removed]"):
            comments_list.append({"author": author, "text": text, "depth": depth})
    children = item.get("children", [])
    if children:
        for child in children:
            extract_comments_recursive(child, depth + 1, comments_list)
    return comments_list

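# Render one comment as "[L<depth>] <author>: <text>"; only the first line carries the prefix.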
def format_comment(author, text, depth):
    prefix = f"[L{depth}] {author}: "
    lines = text.split("\n")
    if not lines:
        return prefix + "\n"
    formatted_lines = [prefix + lines[0]]
    for line in lines[1:]:
        formatted_lines.append(line)
    return "\n".join(formatted_lines)

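# Build the plain-text document (post metadata, post text, then every comment) that is fed to the LLM.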
def build_comments_string(post_id, comments, post_data):
    content = []
    content.append(f"Title: {post_data.get('title', 'N/A')}")
    content.append(f"Points: {post_data.get('points', 0)}")
    content.append(f"Total Comments: {len(comments)}")
    content.append(f"Fetched: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    content.append("=" * 50)
    content.append("")
    if post_data.get("text"):
        content.append("Original Post Text:")
        content.append(clean_html(post_data.get("text")))
        content.append("=" * 50)
        content.append("")
    for comment in comments:
        formatted = format_comment(comment["author"], comment["text"], comment["depth"])
        content.append(formatted)
        content.append("")
    return "\n".join(content)

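# System prompt for the summarization call; it explains the [L<n>] depth prefixes added by format_comment().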
def get_summarization_prompt() -> str:
    return textwrap.dedent("""\
        Summarize the themes of the opinions expressed in the comments of a Hacker News post. For each theme, output a markdown header.
        Include direct "quotations" (with author attribution) where appropriate. You MUST quote directly from users when crediting them, with double quotes.
        Output markdown. Go long. Include a section of quotes that illustrate opinions uncommon in the rest of the piece.
        The [L0], [L1], etc. prefixes indicate the depth of the comment in the discussion tree. A comment with [L0] is a root comment, so [L0] is a direct reply to the post, [L1] is a reply to the last [L0] comment, and so on. Don't use these prefixes in your output.
    """)

def summarize_with_llm(content: str, model: str, api_key: str, base_url: Optional[str] = None) -> tuple[str, dict]:
    client_kwargs = {"api_key": api_key}
    if base_url:
        client_kwargs["base_url"] = base_url
    client = OpenAI(**client_kwargs)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": get_summarization_prompt()},
                {"role": "user", "content": f"{content}"},
            ],
            reasoning_effort="high",
            max_completion_tokens=32000,
            timeout=600,
        )
        summary = response.choices[0].message.content
        usage = {
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens,
            "total_tokens": response.usage.total_tokens,
        }
        if response.usage.completion_tokens_details and response.usage.completion_tokens_details.reasoning_tokens:
            usage["reasoning_tokens"] = response.usage.completion_tokens_details.reasoning_tokens
        return summary, usage
    except Exception as e:
        print(f"Error calling {model}: {str(e)}")
        return None, None

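# Write the summary to hn_summary_<post_id>_<model>.md together with post metadata and token usage.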
def write_summary(
    post_id: str,
    post_data: dict,
    summary: str,
    model: str,
    usage: Optional[dict],
    comments_count: int,
) -> Path:
    summary_filename = Path(f"hn_summary_{post_id}_{model.replace('/', '-')}.md")
    with summary_filename.open("w", encoding="utf-8") as f:
        f.write("# Hacker News Discussion Summary\n\n")
        f.write(f"**Post ID:** {post_id}\n")
        f.write(f"**Title:** {post_data.get('title', 'N/A')}\n")
        f.write(f"**Points:** {post_data.get('points', 0)}\n")
        f.write(f"**Total Comments:** {comments_count}\n")
        f.write(f"**Model:** {model}\n")
        f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        if usage:
            f.write("# Token Usage\n\n")
            f.write(f"- Prompt tokens: {usage['prompt_tokens']:,}\n")
            f.write(f"- Completion tokens: {usage['completion_tokens']:,}\n")
            if "reasoning_tokens" in usage:
                f.write(f"- Reasoning tokens: {usage['reasoning_tokens']:,}\n")
            f.write(f"- Total tokens: {usage['total_tokens']:,}\n\n")
        f.write("---\n\n")
        f.write(summary)
    return summary_filename

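# Parse arguments, fetch and save the comments, then optionally summarize with each requested model.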
def main():
    parser = argparse.ArgumentParser(
        description="Fetch Hacker News comments for a post and optionally summarize with LLMs"
    )
    parser.add_argument("post_id", type=int, help="The HN post ID to fetch comments for")
    parser.add_argument("-o", "--output", help="Output filename (default: hn_comments_<post_id>.txt)")
    parser.add_argument(
        "--models",
        nargs="+",
        help="LLM models to use for summarization (e.g., gpt-4o gpt-4o-mini claude-3-5-sonnet-20241022)",
    )
    args = parser.parse_args()
    post_id = str(args.post_id)

    print(f"Fetching post {post_id} from Hacker News...")
    post_data = fetch_post_data(post_id)
    if post_data is None:
        sys.exit(1)

    print("Extracting comments...")
    comments = extract_comments_recursive(post_data)
    if not comments:
        print("No comments found for this post")
        sys.exit(0)
    print(f"Found {len(comments)} comments")

    content = build_comments_string(post_id, comments, post_data)
    filename = args.output if args.output else f"hn_comments_{post_id}.txt"
    with Path(filename).open("w", encoding="utf-8") as f:
        f.write(content)
    print(f"Comments saved to {filename}")

    if args.models:
        api_key = os.getenv("OPENAI_API_KEY")
        base_url = os.getenv("OPENAI_BASE_URL")
        if not api_key:
            print("Warning: OPENAI_API_KEY not found in environment variables or .env file")
            print("Skipping LLM summarization")
            return
        print(f"\nGenerating summaries with {len(args.models)} model(s)...")
        for model in args.models:
            print(f"\nProcessing with {model}...")
            summary, usage = summarize_with_llm(content, model, api_key, base_url)
            if summary:
                summary_filename = write_summary(
                    post_id=post_id,
                    post_data=post_data,
                    summary=summary,
                    model=model,
                    usage=usage,
                    comments_count=len(comments),
                )
                print(f" ✓ Summary saved to {summary_filename}")
                if usage:
                    print(f" Tokens used: {usage['total_tokens']:,}")
            else:
                print(f" ✗ Failed to generate summary with {model}")


if __name__ == "__main__":
    main()
This is a standalone Python script and can be run with uv without setting up a virtual environment. It uses high reasoning effort by default.

Download this file, hn_post_summary.py, to your local machine. To create a summary using the gpt-5 model with the OpenAI API, run:

export OPENAI_API_KEY=xxx && uv run hn_post_summary.py <HN post id> --models gpt-5

To use it with LiteLLM and create summaries with both gpt-5 and gemini-2.5-pro, run:

export OPENAI_BASE_URL=<litellm base url> && export OPENAI_API_KEY=<litellm api key> && uv run hn_post_summary.py <HN post id> --models openai/gpt-5 vertex_ai/gemini-2.5-pro
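Because the script calls load_dotenv(), the two variables can also be kept in a .env file next to the script instead of being exported each time. A minimal sketch with placeholder values (the key and URL below are examples, not real credentials):

OPENAI_API_KEY=sk-xxxx
OPENAI_BASE_URL=https://litellm.example.com

The OPENAI_BASE_URL line is only needed when pointing the script at an OpenAI-compatible endpoint other than the OpenAI API itself, such as a LiteLLM proxy.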