Skip to content

Instantly share code, notes, and snippets.

@llvtt
Last active May 7, 2016 19:06
Show Gist options
  • Select an option

  • Save llvtt/53799972737f1846c533 to your computer and use it in GitHub Desktop.

Select an option

Save llvtt/53799972737f1846c533 to your computer and use it in GitHub Desktop.
Stream Reddit posts into MongoDB
#!/usr/bin/env python
import datetime
import optparse
import sys
import praw
import praw.helpers
import pymongo
import requests.exceptions
# Default MongoDB host; used as the default for the --mongo-host option.
MONGO_HOST = "localhost"
# Default MongoDB port; used as the default for the --mongo-port option.
MONGO_PORT = 27017
# String handed to praw.Reddit() when the Reddit client is created.
USER_AGENT_STR = "reddit_to_mongo/0.1"
def convert_to_document(post):
    """Build the dictionary stored in MongoDB for one Reddit submission.

    The result always carries ``_id``, ``title``, ``author`` (``id`` and
    ``name``), ``subreddit``, ``text``, ``date`` and ``num_comments``.
    When ``post.num_comments`` is greater than zero, the flattened comment
    tree is attached under ``"comments"`` and ``num_comments`` is the
    length of that list; otherwise ``num_comments`` is 0 and there is no
    ``"comments"`` key.

    Exceptions raised while reading attributes of the post or of its
    comments are not caught here and propagate to the caller.
    """
    # Gather the comments first.  ``None`` means "comments were not
    # fetched", which is different from an empty list.
    comment_docs = None
    if post.num_comments > 0:
        flattened = praw.helpers.flatten_tree(post.comments,
                                              depth_first=True)
        comment_docs = []
        for entry in flattened:
            comment_docs.append({
                "text": entry.body,
                "author": {
                    "id": entry.author.id,
                    "name": entry.author.name,
                },
                "created": datetime.datetime.fromtimestamp(entry.created),
            })

    # Keep only the few submission fields we are interested in.
    document = {}
    document["_id"] = post.id
    document["title"] = post.title
    document["author"] = {"id": post.author.id, "name": post.author.name}
    document["subreddit"] = post.subreddit.display_name
    document["text"] = post.selftext
    document["date"] = datetime.datetime.fromtimestamp(post.created)
    if comment_docs is None:
        document["num_comments"] = 0
    else:
        document["num_comments"] = len(comment_docs)
        document["comments"] = comment_docs
    return document
if __name__ == '__main__':
    # Script entry point: parse the command-line options, connect to
    # MongoDB and Reddit, then store every new submission from the stream
    # until the user interrupts the process.
    parser = optparse.OptionParser()
    parser.add_option("--mongo-host", default=MONGO_HOST, dest='mongo_host',
                      help="Hostname where MongoDB is running.")
    parser.add_option("--mongo-port", default=MONGO_PORT, dest='mongo_port',
                      type=int, help="Port on which MongoDB is listening.")
    parser.add_option("--reddit-namespace", default='reddit.posts',
                      dest='reddit_namespace',
                      help="Reddit data namespace.")
    options, _ = parser.parse_args()

    # The namespace must be "<database>.<collection>".  Validate it before
    # connecting so a bad value gives a usage error instead of an opaque
    # unpacking ValueError.  Only the first dot separates the two parts.
    reddit_db, _, reddit_coll = options.reddit_namespace.partition('.')
    if not reddit_db or not reddit_coll:
        parser.error("--reddit-namespace must have the form "
                     "<database>.<collection>, got %r"
                     % options.reddit_namespace)

    mongodb = pymongo.MongoClient(options.mongo_host, options.mongo_port)
    collection = mongodb[reddit_db][reddit_coll]
    reddit = praw.Reddit(USER_AGENT_STR)
    try:
        # Get an endless stream of new submissions to /r/all.
        new_posts = praw.helpers.submission_stream(reddit, "all")
        for post in new_posts:
            # Insert each post into MongoDB.  Failures for a single post
            # are reported on stderr and the post is skipped, so one bad
            # post never stops the stream.
            try:
                # NOTE(review): Collection.save() is deprecated in newer
                # PyMongo releases; replace_one(..., upsert=True) looks like
                # the equivalent -- confirm against the PyMongo version used.
                collection.save(convert_to_document(post))
            except requests.exceptions.HTTPError as error:
                # HTTP request to retrieve more information about the post
                # returned with a 4xx error.
                sys.stderr.write("skipped %s (HTTP error): %s\n"
                                 % (post.id, error))
            except AttributeError as error:
                # Post or Comment may have been deleted between retrieving
                # it and accessing its fields.
                sys.stderr.write("skipped %s (missing field): %s\n"
                                 % (post.id, error))
            else:
                print("processed: %s..." % post.title[:20])
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the stream; exit cleanly.
        sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment