Skip to content

Instantly share code, notes, and snippets.

@jesrui
Created February 13, 2017 13:22
Show Gist options
  • Select an option

  • Save jesrui/50294f4cac0d0fb28d20118c965d2ab8 to your computer and use it in GitHub Desktop.

Select an option

Save jesrui/50294f4cac0d0fb28d20118c965d2ab8 to your computer and use it in GitHub Desktop.
Download a Hacker News thread from firebaseio.com and convert it to mbox format.
#!/bin/env python
# sample usage:
# $ hn-thread2mbox.py 13606863 > 13606863.mbox
# downloads from hacker-news.firebaseio.com the thread with id=13606863
# and dumps it to stdout in mbox format.
import datetime
import email.utils
import json
import sys
import time
import urllib.request
# URL template for fetching a single item as JSON; the '{}' placeholder is
# filled with the item id via str.format().
HNARCHIVE='https://hacker-news.firebaseio.com/v0/item/{}.json'
def download_kid(post_id):
    """Download a single item and return it as parsed JSON.

    Args:
        post_id: id substituted into the ``HNARCHIVE`` URL template.

    Returns:
        The decoded JSON document of the response body (whatever
        ``json.loads`` yields for it; a JSON ``null`` body gives ``None``).

    Raises:
        Exception: if the response status code is not 200.
        urllib.error.URLError: propagated from ``urlopen`` on transport or
            HTTP errors.
    """
    req = urllib.request.Request(url=HNARCHIVE.format(post_id))
    # The context manager closes the response (and its socket) on every
    # path, including the error raised below; a thread needs one request
    # per post, so leaked connections would pile up quickly.
    with urllib.request.urlopen(req) as ret:
        if ret.getcode() != 200:
            raise Exception('Failed to download post: response code: {} {}'.
                            format(ret.getcode(), ret.reason))
        body = ret.read()
    return json.loads(body.decode())
# Every downloaded post of the thread, in the order collect_posts() saw them.
thread = []
# Maps a post id to the id of its parent (None when the post has no parent);
# used to walk from an item up through its ancestors in the story.
item_ids = {}
def collect_posts(post):
    """Record *post* in the module-level ``thread`` and ``item_ids``.

    The post dict is appended to ``thread``; its ``parent`` field (``None``
    when the key is absent) is stored in ``item_ids`` under the post's ``id``.
    """
    thread.append(post)
    item_ids[post['id']] = post.get('parent')
def _ancestor_chain(parent, parents_by_id):
    """Return the ancestor ids of a post, oldest first, ending with *parent*."""
    chain = [parent]
    while True:
        ancestor = parents_by_id.get(chain[-1])
        # Stop at the root (no recorded parent); also refuse to follow a
        # cycle, which would otherwise loop forever.
        if not ancestor or ancestor in chain:
            break
        chain.append(ancestor)
    # The walk collects parent -> root; the References header wants the
    # opposite order (root first, direct parent last).
    chain.reverse()
    return chain


def _quote_from_lines(body):
    """Prefix body lines that look like an mbox separator with ``>``."""
    return "\n".join(
        ">" + line if line.lstrip(">").startswith("From ") else line
        for line in body.split("\n")
    )


def dump_as_email(post, parents_by_id=None):
    """Print *post* to stdout as one message of an mbox file.

    Args:
        post: item dict; ``id``, ``time`` and ``type`` are required, while
            ``title``, ``text``, ``url``, ``by``, ``parent`` and ``deleted``
            are optional.
        parents_by_id: mapping of post id to parent id used to build the
            ``References`` header. Defaults to the module-level ``item_ids``.

    Raises:
        KeyError: if ``post`` lacks ``id``, ``time`` or ``type``.
    """
    timestamp = post['time']
    author = post.get('by', 'unknown')
    if 'title' in post:
        subject = post['title']
    else:
        subject = '[deleted]' if post.get('deleted') else 'Re:'
    sent = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc)

    headers = [
        # mbox separator line: "From <sender> <asctime date>". asctime() is
        # locale-independent, unlike strftime("%a ... %b").
        "From {}@hndump {}".format(author,
                                   time.asctime(time.gmtime(timestamp))),
        "Message-ID: <{}@hndump>".format(post['id']),
        "From: {0} <{0}@hndump>".format(author),
        "Subject: {}".format(subject),
        # format_datetime always emits English day/month names and a numeric
        # zone, independent of platform strftime extensions and locale.
        "Date: {}".format(email.utils.format_datetime(sent)),
        "Mime-Version: 1.0",
        "Content-Type: text/html; charset=utf-8",
    ]

    parent = post.get('parent')
    if parent:
        if parents_by_id is None:
            parents_by_id = item_ids
        references = " ".join(
            "<{}@hndump>".format(ancestor)
            for ancestor in _ancestor_chain(parent, parents_by_id))
        headers.append("In-Reply-To: <{}@hndump>".format(parent))
        headers.append("References: {}".format(references))
    headers.append(
        "X-HackerNews-Link: https://news.ycombinator.com/item?id={}".format(
            post['id']))

    if post['type'] == 'comment':
        body = "<html>{}</html>".format(post.get('text', ''))
    else:  # story
        body = '<html><a href="{url}" rel="nofollow">{url}</a><p>{text}</html>'.format(
            url=post.get('url', ''), text=post.get('text', ''))

    # A blank line separates headers from body; the trailing newline plus the
    # one print() adds leaves an empty line before the next message.
    message = "{}\n\n{}\n".format("\n".join(headers), _quote_from_lines(body))
    print(message)
def visit_thread(post_id, visit_fn):
    """Download a post and all its descendants, calling *visit_fn* on each.

    Posts are visited depth-first in pre-order: a post first, then each entry
    of its ``kids`` list in order, each followed by its own descendants.
    Every post is downloaded with ``download_kid`` right before it is visited.
    """
    # Explicit stack instead of recursion; the top of the stack is the next
    # post to download.
    pending = [post_id]
    while pending:
        item = download_kid(pending.pop())
        visit_fn(item)
        children = item.get('kids')
        if children:
            # Push in reverse so the first kid is popped (visited) first.
            pending.extend(reversed(children))
def main(argv):
    """Download the thread named by ``argv[0]`` and print it as mbox.

    Args:
        argv: command-line arguments without the program name; the first
            one must be the integer id of the thread's root post.

    Raises:
        SystemExit: with status 2 when the id is missing or not an integer.
    """
    try:
        post_id = int(argv[0])
    except (IndexError, ValueError):
        # Give a usage hint instead of a bare traceback on bad invocation.
        print('usage: hn-thread2mbox.py POST_ID > POST_ID.mbox',
              file=sys.stderr)
        raise SystemExit(2)

    # First pass fills `thread` and `item_ids`; dumping happens afterwards so
    # that the parent map is complete when the References headers are built.
    visit_thread(post_id, collect_posts)
    for post in thread:
        dump_as_email(post)
# Script entry point: hand the command-line arguments (program name
# stripped) to main() only when executed directly, not when imported.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment