Created: August 8, 2025 08:52
Python script to generate a summary of a Hacker News post using one or more LLMs by calling OpenAI API-compatible endpoints.
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "openai==1.99.3",
#     "python-dotenv==1.1.1",
# ]
# ///
import argparse
import json
import os
import re
import sys
import textwrap
import urllib.error
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

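# Fetch the post and its full comment tree from the Algolia HN API; returns None on any error.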
def fetch_post_data(post_id):
    url = f"https://hn.algolia.com/api/v1/items/{post_id}"
    try:
        with urllib.request.urlopen(url) as response:
            data = response.read()
            return json.loads(data)
    except urllib.error.HTTPError as e:
        if e.code == 404:
            print(f"Error: Post with ID {post_id} not found")
        else:
            print(f"HTTP Error {e.code}: {e.reason}")
        return None
    except urllib.error.URLError as e:
        print(f"Network error: {e.reason}")
        return None
    except json.JSONDecodeError:
        print("Error: Invalid JSON response from API")
        return None

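# Convert HN comment HTML to plain text: drop tags and unescape common HTML entities.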
def clean_html(text):
    if text is None:
        return ""
    text = re.sub(r"<p>", "\n\n", text)
    text = re.sub(r"</p>", "", text)
    text = re.sub(r"<br\s*/?>", "\n", text)
    text = re.sub(r"<pre><code>(.*?)</code></pre>", r"\1", text, flags=re.DOTALL)
    text = re.sub(r"<[^>]+>", "", text)
">
    text = text.replace("&gt;", ">")
    text = text.replace("&lt;", "<")
    text = text.replace("&amp;", "&")
    text = text.replace("&quot;", '"')
    text = text.replace("&#x27;", "'")
    text = text.replace("&#x2F;", "/")
    text = text.replace("&nbsp;", " ")
    text = re.sub(r"\n\n+", "\n\n", text)
    return text.strip()

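# Walk the item's comment tree depth-first, collecting author, cleaned text, and depth for each comment.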
def extract_comments_recursive(item, depth=0, comments_list=None):
    if comments_list is None:
        comments_list = []
    if item is None:
        return comments_list
    author = item.get("author")
    text = item.get("text")
    if author or text:
        author = author if author else "[deleted]"
        text = clean_html(text) if text else "[removed]"
        if not (author == "[deleted]" and text == "[removed]"):
            comments_list.append({"author": author, "text": text, "depth": depth})
    children = item.get("children", [])
    if children:
        for child in children:
            extract_comments_recursive(child, depth + 1, comments_list)
    return comments_list

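# Render one comment as "[L<depth>] <author>: <text>"; only the first line carries the prefix.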
def format_comment(author, text, depth):
    prefix = f"[L{depth}] {author}: "
    lines = text.split("\n")
    if not lines:
        return prefix + "\n"
    formatted_lines = [prefix + lines[0]]
    for line in lines[1:]:
        formatted_lines.append(line)
    return "\n".join(formatted_lines)

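# Build the plain-text document (post metadata, post text, then every comment) that is fed to the LLM.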
def build_comments_string(post_id, comments, post_data):
    content = []
    content.append(f"Title: {post_data.get('title', 'N/A')}")
    content.append(f"Points: {post_data.get('points', 0)}")
    content.append(f"Total Comments: {len(comments)}")
    content.append(f"Fetched: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    content.append("=" * 50)
    content.append("")
    if post_data.get("text"):
        content.append("Original Post Text:")
        content.append(clean_html(post_data.get("text")))
        content.append("=" * 50)
        content.append("")
    for comment in comments:
        formatted = format_comment(comment["author"], comment["text"], comment["depth"])
        content.append(formatted)
        content.append("")
    return "\n".join(content)

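# System prompt for the summarization call; it explains the [L<n>] depth prefixes added by format_comment().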
def get_summarization_prompt() -> str:
    return textwrap.dedent("""\
        Summarize the themes of the opinions expressed in the comments of a Hacker News post. For each theme, output a markdown header.
        Include direct "quotations" (with author attribution) where appropriate. You MUST quote directly from users when crediting them, with double quotes.
        Output markdown. Go long. Include a section of quotes that illustrate opinions uncommon in the rest of the piece.
        The [L0], [L1], etc. prefixes indicate the depth of the comment in the discussion tree. A comment with [L0] is a root comment, so [L0] is a direct reply to the post, [L1] is a reply to the last [L0] comment, and so on. Don't use these prefixes in your output.
    """)

def summarize_with_llm(content: str, model: str, api_key: str, base_url: Optional[str] = None) -> tuple[str, dict]:
    client_kwargs = {"api_key": api_key}
    if base_url:
        client_kwargs["base_url"] = base_url
    client = OpenAI(**client_kwargs)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": get_summarization_prompt()},
                {"role": "user", "content": f"{content}"},
            ],
            reasoning_effort="high",
            max_completion_tokens=32000,
            timeout=600,
        )
        summary = response.choices[0].message.content
        usage = {
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens,
            "total_tokens": response.usage.total_tokens,
        }
        if response.usage.completion_tokens_details and response.usage.completion_tokens_details.reasoning_tokens:
            usage["reasoning_tokens"] = response.usage.completion_tokens_details.reasoning_tokens
        return summary, usage
    except Exception as e:
        print(f"Error calling {model}: {str(e)}")
        return None, None

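# Write the summary to hn_summary_<post_id>_<model>.md together with post metadata and token usage.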
def write_summary(
    post_id: str,
    post_data: dict,
    summary: str,
    model: str,
    usage: Optional[dict],
    comments_count: int,
) -> Path:
    summary_filename = Path(f"hn_summary_{post_id}_{model.replace('/', '-')}.md")
    with summary_filename.open("w", encoding="utf-8") as f:
        f.write("# Hacker News Discussion Summary\n\n")
        f.write(f"**Post ID:** {post_id}\n")
        f.write(f"**Title:** {post_data.get('title', 'N/A')}\n")
        f.write(f"**Points:** {post_data.get('points', 0)}\n")
        f.write(f"**Total Comments:** {comments_count}\n")
        f.write(f"**Model:** {model}\n")
        f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        if usage:
            f.write("# Token Usage\n\n")
            f.write(f"- Prompt tokens: {usage['prompt_tokens']:,}\n")
            f.write(f"- Completion tokens: {usage['completion_tokens']:,}\n")
            if "reasoning_tokens" in usage:
                f.write(f"- Reasoning tokens: {usage['reasoning_tokens']:,}\n")
            f.write(f"- Total tokens: {usage['total_tokens']:,}\n\n")
        f.write("---\n\n")
        f.write(summary)
    return summary_filename

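# Parse arguments, fetch and save the comments, then optionally summarize with each requested model.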
def main():
    parser = argparse.ArgumentParser(
        description="Fetch Hacker News comments for a post and optionally summarize with LLMs"
    )
    parser.add_argument("post_id", type=int, help="The HN post ID to fetch comments for")
    parser.add_argument("-o", "--output", help="Output filename (default: hn_comments_<post_id>.txt)")
    parser.add_argument(
        "--models",
        nargs="+",
        help="LLM models to use for summarization (e.g., gpt-4o gpt-4o-mini claude-3-5-sonnet-20241022)",
    )
    args = parser.parse_args()
    post_id = str(args.post_id)

    print(f"Fetching post {post_id} from Hacker News...")
    post_data = fetch_post_data(post_id)
    if post_data is None:
        sys.exit(1)

    print("Extracting comments...")
    comments = extract_comments_recursive(post_data)
    if not comments:
        print("No comments found for this post")
        sys.exit(0)
    print(f"Found {len(comments)} comments")

    content = build_comments_string(post_id, comments, post_data)
    filename = args.output if args.output else f"hn_comments_{post_id}.txt"
    with Path(filename).open("w", encoding="utf-8") as f:
        f.write(content)
    print(f"Comments saved to {filename}")

    if args.models:
        api_key = os.getenv("OPENAI_API_KEY")
        base_url = os.getenv("OPENAI_BASE_URL")
        if not api_key:
            print("Warning: OPENAI_API_KEY not found in environment variables or .env file")
            print("Skipping LLM summarization")
            return
        print(f"\nGenerating summaries with {len(args.models)} model(s)...")
        for model in args.models:
            print(f"\nProcessing with {model}...")
            summary, usage = summarize_with_llm(content, model, api_key, base_url)
            if summary:
                summary_filename = write_summary(
                    post_id=post_id,
                    post_data=post_data,
                    summary=summary,
                    model=model,
                    usage=usage,
                    comments_count=len(comments),
                )
                print(f" ✓ Summary saved to {summary_filename}")
                if usage:
                    print(f" Tokens used: {usage['total_tokens']:,}")
            else:
                print(f" ✗ Failed to generate summary with {model}")


if __name__ == "__main__":
    main()
This is a standalone Python script and can be run with uv without setting up a virtual environment. It uses high reasoning effort by default.

Download this file, hn_post_summary.py, to your local machine. To create a summary using the gpt-5 model with the OpenAI API, run:

export OPENAI_API_KEY=xxx && uv run hn_post_summary.py <HN post id> --models gpt-5

To use it with LiteLLM and create summaries with both gpt-5 and gemini-2.5-pro, run:

export OPENAI_BASE_URL=<litellm base url> && export OPENAI_API_KEY=<litellm api key> && uv run hn_post_summary.py <HN post id> --models openai/gpt-5 vertex_ai/gemini-2.5-pro
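Because the script calls load_dotenv(), the two variables can also be kept in a .env file next to the script instead of being exported each time. A minimal sketch with placeholder values (the key and URL below are examples, not real credentials):

OPENAI_API_KEY=sk-xxxx
OPENAI_BASE_URL=https://litellm.example.com

The OPENAI_BASE_URL line is only needed when pointing the script at an OpenAI-compatible endpoint other than the OpenAI API itself, such as a LiteLLM proxy.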