-
-
Save zmwangx/6bb9ddb5d904b7275c62 to your computer and use it in GitHub Desktop.
| #!/usr/bin/env python | |
| ############################### README ############################### | |
| # External dependencies: | |
| # * libmagic | |
| # * python-dateutil | |
| # * python-magic | |
| # * requests | |
| # | |
| # The shell command call at the end was tested with GNU date from GNU | |
| # coreutils. Other implementations might not work (for instance, BSD | |
| # date is not compatible). However, you may comment out the last | |
| # section without affecting the download. | |
| # | |
| # The "last_update_timestamp" is reserved for the update feature, | |
| # which is not implemented yet. | |
| ###################################################################### | |
| import commands | |
| import json | |
| import os | |
| import sys | |
| import urllib | |
| import dateutil.parser | |
| import magic | |
| import requests | |
########################### CUSTOMIZE THIS ###########################
page_id = "567045406742962"
access_token = "app_id|app_secret" # see https://developers.facebook.com/docs/facebook-login/access-tokens#apptokens
dest = os.path.expanduser("~/img/sns/apink-official-facebook")
website_title = "apink-official-facebook"
######################################################################

# make sure the download directory exists before anything is written
# into it
if not os.path.exists(dest):
    os.makedirs(dest)

# read last update time, if it is available; photos created before
# this time are treated as already downloaded.  Default to the Unix
# epoch so that a first run downloads everything.
last_update_record = dest + "/last_update_timestamp"
last_update_time = dateutil.parser.parse("1970-01-01T00:00+00:00")
if os.path.exists(last_update_record):
    with open(last_update_record, "r") as f:
        # only the first line is meaningful; drop the trailing newline
        last_update_timestamp = f.readline().strip()
    if last_update_timestamp:
        try:
            last_update_time = dateutil.parser.parse(last_update_timestamp)
        except (ValueError, OverflowError):
            # The record does not hold a parsable timestamp (for
            # example, it contains an error message instead of a
            # date).  Keep the epoch default rather than aborting.
            sys.stderr.write(last_update_record + ": warning: " +
                             "unparsable timestamp, downloading " +
                             "everything\n")
# this function makes an API call with only an access_token (which
# could be just app-id|app-secret)
#
# endpoint:     path appended to https://graph.facebook.com/
# params:       dict of extra query parameters (may be empty or None);
#               it is copied, so the caller's dict is left untouched
# access_token: token sent as the "access_token" query parameter
#
# Returns the response body decoded from JSON.  The HTTP status is not
# checked here, so callers must be prepared for a body that does not
# have the shape they expect.
def fb_public_call(endpoint, params, access_token):
    # work on a copy so the token does not leak into a dict that the
    # caller may reuse for other requests
    query = dict(params or {})
    query["access_token"] = access_token
    response = requests.get("https://graph.facebook.com/" + endpoint,
                            params=query)
    return response.json()
# return codes of handle_photo
SUCCESS = 0
FAILED_DOWNLOAD = 1
UNRECOGNIZED_MIME = 2
OLD_PHOTO = 255 # photo older than last update time

# extension appended to a downloaded file for each accepted MIME type
_MIME_EXTENSIONS = {
    "image/gif": ".gif",
    "image/jpeg": ".jpg",
    "image/png": ".png",
}

# this function downloads a photo
#
# photo:    photo object from the API; the keys "id", "created_time"
#           and "images" are read
# album_id: id of the containing album, used in the file name
#
# The file is saved in dest as
# <YYYYmmddHHMMSS>-<website_title>-<album_id>-<photo_id><extension>,
# where the extension is chosen from the detected MIME type.
#
# Returns one of the codes defined above:
#   OLD_PHOTO          created before last_update_time; nothing fetched
#   FAILED_DOWNLOAD    the download raised an error or left no file
#   UNRECOGNIZED_MIME  the file is not GIF, JPEG or PNG (it is left on
#                      disk without an extension)
#   SUCCESS            the file was downloaded and renamed
def handle_photo(photo, album_id):
    photo_id = photo["id"]
    created = dateutil.parser.parse(photo["created_time"])
    if created < last_update_time:
        return OLD_PHOTO

    # print information
    time_print = created.strftime("%b %d, %Y")
    time_full = created.strftime("%Y%m%d%H%M%S")
    # NOTE(review): the first entry of "images" is presumably the
    # full-size rendition -- confirm against the API documentation
    original_image = photo["images"][0]
    height = original_image["height"]
    width = original_image["width"]
    format_string = "date: %s id: %s size: %sx%s"
    # a single parenthesised argument prints identically whether print
    # is a statement or a function
    print(format_string % (time_print, photo_id, width, height))

    # download file
    source_uri = original_image["source"]
    filename = "-".join([time_full, website_title, album_id, photo_id])
    filepath = dest + "/" + filename
    try:
        urllib.urlretrieve(source_uri, filepath)
    except IOError as exc:
        # network-level failures surface as IOError; report them
        # through the return code instead of aborting the whole run
        sys.stderr.write("error: " + filename + " failed to " +
                         "download from " + source_uri + ": " +
                         str(exc) + "\n")
        return FAILED_DOWNLOAD
    if not os.path.exists(filepath):
        # download failed for whatever reason
        sys.stderr.write("error: " + filename + " failed to " +
                         "download from " + source_uri + "\n")
        return FAILED_DOWNLOAD

    # identify mime type and attach extension
    mime = magic.from_file(filepath, mime=True)
    extension = _MIME_EXTENSIONS.get(mime)
    if extension is None:
        sys.stderr.write(filepath + ": error: " +
                         "unrecognized image type\n")
        return UNRECOGNIZED_MIME
    os.rename(filepath, filepath + extension)
    return SUCCESS
# this function handles an album, i.e., download newly added photos
# since the last update
#
# album: album object from the API; "id" and "name" are required,
#        "count" is optional (some responses omit it)
#
# Pages of photos are fetched one after another until there is no
# "next" link.  Processing of the album stops as soon as handle_photo
# reports OLD_PHOTO, or when a response carries no "data" list.
# Returns None.
def handle_album(album):
    # print album info
    album_id = album["id"]
    format_string = "downloading album \"%s\" " + \
                    "(album id: %s; photo count: %s)"
    # a single parenthesised argument prints identically whether print
    # is a statement or a function
    print(format_string % (album["name"], album_id,
                           album.get("count", "unknown")))
    print("-" * 80)

    # retrieve photos in the album
    photos_response = fb_public_call(album_id + "/photos",
                                     params, access_token)
    while True:
        if "data" not in photos_response:
            # not a photo listing (e.g. an error object); report it
            # and give up on this album only
            sys.stderr.write("error: unexpected response for album " +
                             album_id + ": " +
                             json.dumps(photos_response) + "\n")
            return
        for photo in photos_response["data"]:
            if handle_photo(photo, album_id) == OLD_PHOTO:
                # already encountered old photo in this album
                # no need to look further into the past
                # NOTE(review): this relies on photos being listed
                # newest first -- confirm
                return
        next_uri = photos_response.get("paging", {}).get("next")
        if not next_uri:
            break
        photos_response = requests.get(next_uri).json()
# extra query parameters sent with every listing request (none needed)
params = {}

# retrieve albums, one page of results at a time, and download each
albums_response = fb_public_call(page_id + "/albums", params,
                                 access_token)
while True:
    if "data" not in albums_response:
        # not an album listing (e.g. an error object returned for this
        # page id or access token); there is nothing to download
        sys.stderr.write("error: unexpected response while listing " +
                         "albums: " + json.dumps(albums_response) +
                         "\n")
        sys.exit(1)
    for album in albums_response["data"]:
        handle_album(album)
    next_uri = albums_response.get("paging", {}).get("next")
    if not next_uri:
        break
    albums_response = requests.get(next_uri).json()

# create a file "last_update_timestamp" for future use; it is read
# back at startup to decide which photos count as old
# NOTE(review): the timestamp is taken after all downloads finish, so
# a photo created while the script is running may be skipped by the
# next run -- consider recording the start time instead
with open(last_update_record, "w") as f:
    f.write(commands.getoutput("date -u --iso-8601=seconds"))
Does this work on any pages anymore? Didn't seem to work on page ID 394278667320706 or other relatively public personal pages. Just trying to debug for Shane's use case.
@shaneholloman @leotam Hey guys. First, commenting on a gist is not the best way to get help. In case you don't know, there are no notifications for comments on gists, so if I hadn't been looking through my older gists by chance and seen a greater-than-zero number of comments here, I would never have known.
Secondly, regarding the issues you're having: this script is more than a year old, and I quit scraping Facebook stuff at least nine months ago. Things have changed. Just try the REST requests with Facebook's API Explorer, or a REST client like Paw, or just good ol' curl, and you'll see that /222407964522124/albums returns 400, and although /394278667320706/albums works, using the album endpoint on the albums returned just gives me a truncated response with only created_time, name and id, without actual info of photos. Same goes for a few other public pages I tried. So there's nothing I can do; the simplest access token isn't good enough anymore.
What I would suggest is to do a full blown OAuth. I'm not going to try, so I won't guarantee success.
Since this thing was broken, I built my own. It's still a work in progress but it currently downloads photos according to albums.
How come I need an access token for that? It's a public page.
not running
Traceback (most recent call last):
File "fb-dl.py", line 140, in <module>
handle_album(album)
File "fb-dl.py", line 114, in handle_album
album["count"])
KeyError: 'count'
@Mouradif: Since it's the only way you're getting access to Facebook's API.
I have a problem:
Traceback (most recent call last):
File "fb-dl.py", line 138, in
for album in albums_response["data"]:
KeyError: 'data'
What can I do?
Looks like it may be an issue with a Facebook "Company page". Here's an example: http://findmyfbid.com/success/222407964522124
Yup, same community pages as well