@primaprashant
Created August 8, 2025 08:52
Python script to generate a summary of a Hacker News post using one or more LLMs by calling OpenAI API-compatible endpoints.

# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "openai==1.99.3",
#     "python-dotenv==1.1.1",
# ]
# ///
import argparse
import json
import os
import re
import sys
import textwrap
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

def fetch_post_data(post_id):
    # Fetch the full post (including the nested comment tree) from the Algolia HN API.
    url = f"https://hn.algolia.com/api/v1/items/{post_id}"
    try:
        with urllib.request.urlopen(url) as response:
            data = response.read()
            return json.loads(data)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            print(f"Error: Post with ID {post_id} not found")
        else:
            print(f"HTTP Error {e.code}: {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"Network error: {e.reason}")
        return None
    except json.JSONDecodeError:
        print("Error: Invalid JSON response from API")
        return None

def clean_html(text):
    # Strip HN comment HTML down to plain text, preserving paragraph and line breaks.
    if text is None:
        return ""
    text = re.sub(r"<p>", "\n\n", text)
    text = re.sub(r"</p>", "", text)
    text = re.sub(r"<br\s*/?>", "\n", text)
    text = re.sub(r"<pre><code>(.*?)</code></pre>", r"\1", text, flags=re.DOTALL)
    text = re.sub(r"<[^>]+>", "", text)
    text = text.replace("&gt;", ">")
    text = text.replace("&lt;", "<")
    text = text.replace("&amp;", "&")
    text = text.replace("&quot;", '"')
    text = text.replace("&#x27;", "'")
    text = text.replace("&#x2F;", "/")
    text = text.replace("&nbsp;", " ")
    text = re.sub(r"\n\n+", "\n\n", text)
    return text.strip()

def extract_comments_recursive(item, depth=0, comments_list=None):
    # Walk the comment tree depth-first, flattening it into a list of
    # {author, text, depth} dicts and skipping fully deleted comments.
    if comments_list is None:
        comments_list = []
    if item is None:
        return comments_list
    author = item.get("author")
    text = item.get("text")
    if author or text:
        author = author if author else "[deleted]"
        text = clean_html(text) if text else "[removed]"
        if not (author == "[deleted]" and text == "[removed]"):
            comments_list.append({"author": author, "text": text, "depth": depth})
    children = item.get("children", [])
    if children:
        for child in children:
            extract_comments_recursive(child, depth + 1, comments_list)
    return comments_list

def format_comment(author, text, depth):
    # Prefix the first line with "[L<depth>] <author>: " so the LLM can see
    # where each comment sits in the discussion tree.
    prefix = f"[L{depth}] {author}: "
    lines = text.split("\n")
    if not lines:
        return prefix + "\n"
    formatted_lines = [prefix + lines[0]]
    for line in lines[1:]:
        formatted_lines.append(line)
    return "\n".join(formatted_lines)

def build_comments_string(post_id, comments, post_data):
    content = []
    content.append(f"Title: {post_data.get('title', 'N/A')}")
    content.append(f"Points: {post_data.get('points', 0)}")
    content.append(f"Total Comments: {len(comments)}")
    content.append(f"Fetched: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    content.append("=" * 50)
    content.append("")
    if post_data.get("text"):
        content.append("Original Post Text:")
        content.append(clean_html(post_data.get("text")))
        content.append("=" * 50)
        content.append("")
    for comment in comments:
        formatted = format_comment(comment["author"], comment["text"], comment["depth"])
        content.append(formatted)
        content.append("")
    return "\n".join(content)

def get_summarization_prompt() -> str:
    return textwrap.dedent("""\
        Summarize the themes of the opinions expressed in the comments of a Hacker News post. For each theme, output a markdown header.
        Include direct "quotations" (with author attribution) where appropriate. You MUST quote directly from users when crediting them, with double quotes.
        Output markdown. Go long. Include a section of quotes that illustrate opinions uncommon in the rest of the piece.
        The [L0], [L1], etc. prefixes indicate the depth of the comment in the discussion tree. A comment with [L0] is a root comment, so [L0] is a direct reply to the post, [L1] is a reply to the last [L0] comment, and so on. Don't use these prefixes in your output.
        """)

def summarize_with_llm(content: str, model: str, api_key: str, base_url: Optional[str] = None) -> tuple[Optional[str], Optional[dict]]:
    client_kwargs = {"api_key": api_key}
    if base_url:
        client_kwargs["base_url"] = base_url
    client = OpenAI(**client_kwargs)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": get_summarization_prompt()},
                {"role": "user", "content": f"{content}"},
            ],
            reasoning_effort="high",
            max_completion_tokens=32000,
            timeout=600,
        )
        summary = response.choices[0].message.content
        usage = {
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens,
            "total_tokens": response.usage.total_tokens,
        }
        # Reasoning models report reasoning tokens separately; record them when present.
        if response.usage.completion_tokens_details and response.usage.completion_tokens_details.reasoning_tokens:
            usage["reasoning_tokens"] = response.usage.completion_tokens_details.reasoning_tokens
        return summary, usage
    except Exception as e:
        print(f"Error calling {model}: {str(e)}")
        return None, None

def write_summary(
    post_id: str,
    post_data: dict,
    summary: str,
    model: str,
    usage: Optional[dict],
    comments_count: int,
) -> Path:
    summary_filename = Path(f"hn_summary_{post_id}_{model.replace('/', '-')}.md")
    with summary_filename.open("w", encoding="utf-8") as f:
        f.write("# Hacker News Discussion Summary\n\n")
        f.write(f"**Post ID:** {post_id}\n")
        f.write(f"**Title:** {post_data.get('title', 'N/A')}\n")
        f.write(f"**Points:** {post_data.get('points', 0)}\n")
        f.write(f"**Total Comments:** {comments_count}\n")
        f.write(f"**Model:** {model}\n")
        f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        if usage:
            f.write("# Token Usage\n\n")
            f.write(f"- Prompt tokens: {usage['prompt_tokens']:,}\n")
            f.write(f"- Completion tokens: {usage['completion_tokens']:,}\n")
            if "reasoning_tokens" in usage:
                f.write(f"- Reasoning tokens: {usage['reasoning_tokens']:,}\n")
            f.write(f"- Total tokens: {usage['total_tokens']:,}\n\n")
        f.write("---\n\n")
        f.write(summary)
    return summary_filename

def main():
    parser = argparse.ArgumentParser(
        description="Fetch Hacker News comments for a post and optionally summarize with LLMs"
    )
    parser.add_argument("post_id", type=int, help="The HN post ID to fetch comments for")
    parser.add_argument("-o", "--output", help="Output filename (default: hn_comments_<post_id>.txt)")
    parser.add_argument(
        "--models",
        nargs="+",
        help="LLM models to use for summarization (e.g., gpt-4o gpt-4o-mini claude-3-5-sonnet-20241022)",
    )
    args = parser.parse_args()
    post_id = str(args.post_id)
    print(f"Fetching post {post_id} from Hacker News...")
    post_data = fetch_post_data(post_id)
    if post_data is None:
        sys.exit(1)
    print("Extracting comments...")
    comments = extract_comments_recursive(post_data)
    if not comments:
        print("No comments found for this post")
        sys.exit(0)
    print(f"Found {len(comments)} comments")
    content = build_comments_string(post_id, comments, post_data)
    filename = args.output if args.output else f"hn_comments_{post_id}.txt"
    with Path(filename).open("w", encoding="utf-8") as f:
        f.write(content)
    print(f"Comments saved to {filename}")
    if args.models:
        api_key = os.getenv("OPENAI_API_KEY")
        base_url = os.getenv("OPENAI_BASE_URL")
        if not api_key:
            print("Warning: OPENAI_API_KEY not found in environment variables or .env file")
            print("Skipping LLM summarization")
            return
        print(f"\nGenerating summaries with {len(args.models)} model(s)...")
        for model in args.models:
            print(f"\nProcessing with {model}...")
            summary, usage = summarize_with_llm(content, model, api_key, base_url)
            if summary:
                summary_filename = write_summary(
                    post_id=post_id,
                    post_data=post_data,
                    summary=summary,
                    model=model,
                    usage=usage,
                    comments_count=len(comments),
                )
                print(f"  ✓ Summary saved to {summary_filename}")
                if usage:
                    print(f"    Tokens used: {usage['total_tokens']:,}")
            else:
                print(f"  ✗ Failed to generate summary with {model}")


if __name__ == "__main__":
    main()
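
For reference, each summary file produced by write_summary starts with a header of this shape (every value below is a placeholder, not real output):

# Hacker News Discussion Summary

**Post ID:** <HN post id>
**Title:** <post title>
**Points:** <points>
**Total Comments:** <count>
**Model:** <model name>
**Generated:** <timestamp>

# Token Usage

- Prompt tokens: <n>
- Completion tokens: <n>
- Reasoning tokens: <n, only for reasoning models>
- Total tokens: <n>

---

<summary body>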
@primaprashant (Author)

This is a standalone Python script and can be run with uv without setting up a virtual environment. It uses high reasoning effort by default.

Download this file as hn_post_summary.py to your local machine. Then, to create a summary using the gpt-5 model with the OpenAI API, run:
export OPENAI_API_KEY=xxx && uv run hn_post_summary.py <HN post id> --models gpt-5

To use it with LiteLLM and create summaries with both gpt-5 and gemini-2.5-pro, run:
export OPENAI_BASE_URL=<litellm base url> && export OPENAI_API_KEY=<litellm api key> && uv run hn_post_summary.py <HN post id> --models openai/gpt-5 vertex_ai/gemini-2.5-pro
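
Since the script calls load_dotenv(), these credentials can also live in a .env file next to the script instead of being exported on each run. A minimal sketch, with placeholder values:

OPENAI_API_KEY=<openai or litellm api key>
# Only needed when pointing at LiteLLM or another OpenAI-compatible endpoint:
OPENAI_BASE_URL=<litellm base url>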
