Created
February 13, 2017 13:22
-
-
Save jesrui/50294f4cac0d0fb28d20118c965d2ab8 to your computer and use it in GitHub Desktop.
download a hacker news thread from firebaseio.com and convert it to mbox format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/env python | |
| # sample usage: | |
| # $ hn-thread2mbox.py 13606863 > 13606863.mbox | |
| # downloads from hacker-news.firebaseio.com the thread with id=13606863 | |
| # and dumps it to stdout in mbox format. | |
| import urllib.request | |
| import json | |
| import time | |
| import sys | |
# URL template of the Hacker News item endpoint; '{}' takes the item id.
HNARCHIVE = 'https://hacker-news.firebaseio.com/v0/item/{}.json'


def download_kid(post_id):
    """Download one Hacker News item and return its decoded JSON document.

    Args:
        post_id: id of the item (story or comment) to fetch; it is
            substituted into the ``HNARCHIVE`` URL template.

    Returns:
        Whatever ``json.loads`` yields for the response body (``None``
        when the body is JSON ``null``).

    Raises:
        Exception: if the response status code is not 200.
    """
    req = urllib.request.Request(url=HNARCHIVE.format(post_id))
    # The response object holds an open connection; the ``with`` block
    # closes it on every path, including the error raise below.
    with urllib.request.urlopen(req) as ret:
        if ret.getcode() != 200:
            raise Exception(
                'Failed to download post: response code: {} {}'.format(
                    ret.getcode(), ret.reason))
        body = ret.read()
    return json.loads(body.decode())
# Every post of the thread, in the order the posts were collected.
thread = []

# Maps a post id to the id of its parent (None when the post carries no
# 'parent' field); used to walk from a post up through its story.
item_ids = {}


def collect_posts(post):
    """Append *post* to ``thread`` and record its parent in ``item_ids``."""
    thread.append(post)
    item_ids[post['id']] = post.get('parent')
def dump_as_email(post):
    """Print *post* to stdout as one message of an mbox file.

    The message carries the post id as Message-ID, the author as sender
    and the post text as an HTML body.  A post that has a parent also
    gets ``In-Reply-To`` and ``References`` headers; its ancestors are
    looked up in the module-level ``item_ids`` mapping.

    Args:
        post: dict describing the post.  ``'id'``, ``'time'`` (seconds
            since the epoch) and ``'type'`` are required; ``'title'``,
            ``'text'``, ``'url'``, ``'by'``, ``'parent'`` and
            ``'deleted'`` are optional.

    Raises:
        KeyError: if a required key is missing from *post*.
    """
    # Imported here so the function does not rely on the file's import
    # block providing these names.
    from datetime import datetime, timezone
    from email.utils import format_datetime

    # RFC 5322 date in UTC ("Thu, 09 Feb 2017 12:00:00 +0000").  Unlike
    # time.strftime this does not depend on the process locale or on the
    # platform supporting the "%T" directive.
    date = format_datetime(datetime.fromtimestamp(post['time'], timezone.utc))

    post_id = post['id']
    author = post.get('by', 'unknown')
    if 'title' in post:
        subject = post['title']
    else:
        # Comments have no title of their own.
        subject = '[deleted]' if post.get('deleted') else 'Re:'

    headers = [
        # mbox message separator: a bare "From " line (no envelope
        # sender or date is written).
        "From ",
        "Message-ID: <{}@hndump>".format(post_id),
        "From: {0} <{0}@hndump>".format(author),
        "Subject: {}".format(subject),
        "Date: {}".format(date),
        "Mime-Version: 1.0",
        "Content-Type: text/html; charset=utf-8",
    ]

    parent = post.get('parent')
    if parent:
        headers.append("In-Reply-To: <{}@hndump>".format(parent))
        # Climb from the direct parent towards the top of the story.
        ancestors = [parent]
        while True:
            ancestor = item_ids.get(ancestors[-1])
            if ancestor is None or ancestor == 0:
                break
            ancestors.append(ancestor)
        # RFC 5322 wants References ordered oldest first, with the
        # direct parent last, so flip the climb order.
        ancestors.reverse()
        headers.append("References: {}".format(
            " ".join("<{}@hndump>".format(a) for a in ancestors)))

    headers.append(
        "X-HackerNews-Link: https://news.ycombinator.com/item?id={}".format(
            post_id))

    text = post.get('text', '')
    if post['type'] == 'comment':
        body = "<html>{}</html>".format(text)
    else:  # story
        body = '<html><a href="{url}" rel="nofollow">{url}</a><p>{text}</html>'.format(
            url=post.get('url', ''), text=text)

    # An empty line separates headers from body; print() contributes the
    # final empty line that separates this message from the next one.
    print("\n".join(headers) + "\n\n" + body + "\n")
def visit_thread(post_id, visit_fn):
    """Download the post *post_id* and all its descendants, visiting each.

    Posts are visited in pre-order: a post first, then each of its kids
    (with that kid's whole subtree) in the order of the post's 'kids'
    list.

    Args:
        post_id: id of the post at the top of the (sub)thread.
        visit_fn: callable invoked with the decoded document of every
            downloaded post.
    """
    # An explicit stack instead of recursion keeps deep threads from
    # running into the interpreter's recursion limit.
    pending = [post_id]
    while pending:
        body = download_kid(pending.pop())
        if body is None:
            # download_kid returns None when the response is JSON null
            # (presumably the API's answer for an id with no item --
            # confirm); there is nothing to visit and no kids to follow.
            continue
        visit_fn(body)
        # Pushed in reverse so the first kid is popped, and therefore
        # visited, first.
        pending.extend(reversed(body.get('kids') or []))
def main(argv):
    """Download a thread and dump it to stdout in mbox format.

    Args:
        argv: command-line arguments without the program name; the first
            one is the id of the post at the top of the thread.  Further
            arguments are ignored.

    Raises:
        SystemExit: with status 2, after printing a usage line to
            stderr, when the id is missing or is not an integer.
    """
    try:
        post_id = int(argv[0])
    except (IndexError, ValueError):
        print('usage: hn-thread2mbox.py THREAD_ID', file=sys.stderr)
        sys.exit(2)

    # First collect the whole thread so that every post's ancestors are
    # known, then write the messages out.
    visit_thread(post_id, collect_posts)
    for post in thread:
        dump_as_email(post)


if __name__ == '__main__':
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment