# https://gist.github.com/Chandler/fb7a070f52883849de35 SEE HERE
#
# MIT License
# Copyright (c) 2016 Chandler Abraham
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from slacker import Slacker
import json
import argparse
import os
import sys, time


class RetryError(Exception):
    pass


def retryloop(attempts, timeout):
    starttime = time.time()
    success = set()
    for i in range(attempts):
        success.add(True)
        yield success.clear
        if success:
            return
        if time.time() > starttime + timeout:
            break
    # raise RetryError
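
# How the retryloop generator is used further down (illustrative note, not part
# of the original script): each attempt adds True to `success` before yielding
# `success.clear` as the retry callback. If the loop body finishes without
# calling retry(), `success` stays non-empty and the generator returns; calling
# retry() empties the set, so another attempt is handed out until the attempt
# count or the timeout is exhausted. For example:
#
#   for retry in retryloop(10000, timeout=2):
#       try:
#           do_flaky_slack_call()  # hypothetical placeholder for a slacker call
#       except Exception:
#           retry()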
# This script finds all channels, private channels and direct messages
# that your user participates in, downloads the complete history for
# those conversations and writes each conversation out to separate json files.
#
# This user-centric history gathering is nice because the official slack data exporter
# only exports public channels.
#
# PS, this only works if your slack team has a paid account which allows for unlimited history.
#
# PPS, this use of the API is blessed by Slack.
# https://get.slack.help/hc/en-us/articles/204897248
# "If you want to export the contents of your own private groups and direct messages
# please see our API documentation."
#
# get your slack user token at the bottom of this page
# https://api.slack.com/web
#
# dependencies:
#   pip install slacker  # https://github.com/os/slacker
#
# usage examples
#   python slack_history.py --token='123token'
#   python slack_history.py --token='123token' --dryRun
#   python slack_history.py --token='123token' --skipDirectMessages
#   python slack_history.py --token='123token' --skipDirectMessages --skipPrivateChannels


# fetches the complete message history for a channel/group/im
#
# pageableObject could be:
#   slack.channels
#   slack.groups
#   slack.im
#
# channelId is the id of the channel/group/im you want to download history for.
def getHistory(pageableObject, channelId, pageSize = 100):
    messages = []
    lastTimestamp = None

    while True:
        response = pageableObject.history(
            channel = channelId,
            latest = lastTimestamp,
            oldest = 0,
            count = pageSize
        ).body

        messages.extend(response['messages'])

        if response['has_more'] == True:
            lastTimestamp = messages[-1]['ts']  # -1 means last element in a list
        else:
            break
    return messages
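
# Example of calling getHistory on its own (illustrative only; "C024BE91L" is a
# made-up channel id and `slack` is assumed to be an already-authenticated
# Slacker client):
#
#   slack = Slacker("xoxp-your-token")
#   messages = getHistory(slack.channels, "C024BE91L", pageSize=200)
#   print("fetched {0} messages".format(len(messages)))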
def mkdir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)


# fetch and write history for all public channels
def getChannels(slack, dryRun):
    channels = slack.channels.list().body['channels']

    print("\nfound channels: ")
    for channel in channels:
        print(channel['name'])

    if not dryRun:
        parentDir = "channels"
        mkdir(parentDir)

        for channel in channels:
            print("getting history for channel {0}".format(channel['name']))
            fileName = "{parent}/{file}.json".format(parent = parentDir, file = channel['name'])
            for retry in retryloop(10000, timeout=2):
                try:
                    messages = getHistory(slack.channels, channel['id'])
                    channelInfo = slack.channels.info(channel['id']).body['channel']
                    with open(fileName, 'w') as outFile:
                        print("writing {0} records to {1}".format(len(messages), fileName))
                        json.dump({'channel_info': channelInfo, 'messages': messages}, outFile, indent=4)
                except Exception:
                    retry()


# fetch and write history for all direct message conversations
# also known as IMs in the slack API.
def getDirectMessages(slack, ownerId, userIdNameMap, dryRun):
    dms = slack.im.list().body['ims']

    print("\nfound direct messages (1:1) with the following users:")
    for dm in dms:
        print(userIdNameMap.get(dm['user'], dm['user'] + " (name unknown)"))

    if not dryRun:
        parentDir = "direct_messages"
        mkdir(parentDir)

        for dm in dms:
            name = userIdNameMap.get(dm['user'], dm['user'] + " (name unknown)")
            print("getting history for direct messages with {0}".format(name))
            fileName = "{parent}/{file}.json".format(parent = parentDir, file = name)
            for retry in retryloop(10000, timeout=2):
                try:
                    messages = getHistory(slack.im, dm['id'])
                    channelInfo = {'members': [dm['user'], ownerId]}
                    with open(fileName, 'w') as outFile:
                        print("writing {0} records to {1}".format(len(messages), fileName))
                        json.dump({'channel_info': channelInfo, 'messages': messages}, outFile, indent=4)
                except Exception:
                    retry()


# fetch and write history for all private channels
# also known as groups in the slack API.
def getPrivateChannels(slack, dryRun):
    groups = slack.groups.list().body['groups']

    print("\nfound private channels:")
    for group in groups:
        print("{0}: ({1} members)".format(group['name'], len(group['members'])))

    if not dryRun:
        parentDir = "private_channels"
        mkdir(parentDir)

        for group in groups:
            messages = []
            print("getting history for private channel {0} with id {1}".format(group['name'], group['id']))
            fileName = "{parent}/{file}.json".format(parent = parentDir, file = group['name'])
            for retry in retryloop(10000, timeout=2):
                try:
                    messages = getHistory(slack.groups, group['id'])
                    channelInfo = slack.groups.info(group['id']).body['group']
                    with open(fileName, 'w') as outFile:
                        print("writing {0} records to {1}".format(len(messages), fileName))
                        json.dump({'channel_info': channelInfo, 'messages': messages}, outFile, indent=4)
                except Exception:
                    retry()


# fetch all users for the channel and return a map userId -> userName
def getUserMap(slack):
    # get all users in the slack organization
    users = slack.users.list().body['members']
    userIdNameMap = {}
    for user in users:
        userIdNameMap[user['id']] = user['name']
    print("found {0} users ".format(len(users)))
    return userIdNameMap


# get basic info about the slack channel to ensure the authentication token works
def doTestAuth(slack):
    testAuth = slack.auth.test().body
    teamName = testAuth['team']
    currentUser = testAuth['user']
    print("Successfully authenticated for team {0} and user {1} ".format(teamName, currentUser))
    return testAuth
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='download slack history')

    parser.add_argument('--token', help="an api token for a slack user")

    parser.add_argument(
        '--dryRun',
        action='store_true',
        default=False,
        help="if dryRun is set, don't fetch/write history, only list channel names")

    parser.add_argument(
        '--skipPrivateChannels',
        action='store_true',
        default=False,
        help="skip fetching history for private channels")

    parser.add_argument(
        '--skipChannels',
        action='store_true',
        default=False,
        help="skip fetching history for channels")

    parser.add_argument(
        '--skipDirectMessages',
        action='store_true',
        default=False,
        help="skip fetching history for direct messages")

    args = parser.parse_args()

    slack = Slacker(args.token)

    testAuth = doTestAuth(slack)
    userIdNameMap = getUserMap(slack)

    dryRun = args.dryRun

    if not dryRun:
        with open('metadata.json', 'w') as outFile:
            print("writing metadata")
            metadata = {
                'auth_info': testAuth,
                'users': userIdNameMap
            }
            json.dump(metadata, outFile, indent=4)

    if not args.skipChannels:
        getChannels(slack, dryRun)

    if not args.skipPrivateChannels:
        getPrivateChannels(slack, dryRun)

    if not args.skipDirectMessages:
        getDirectMessages(slack, testAuth['user_id'], userIdNameMap, dryRun)
It may not be pretty, but I was able to throttle it enough to get all of our channels. The changes I made are shown below.
    if not dryRun:
        parentDir = "channels"
        mkdir(parentDir)

        for channel in channels:
            print("getting history for channel {0}".format(channel['name']))
            fileName = "{parent}/{file}.json".format(parent = parentDir, file = channel['name'])
            time.sleep(30)
            for retry in retryloop(10000, timeout=300):
I also realized why I didn't get an error when it bypassed the larger channels. At the end of def retryloop I saw that "raise RetryError" was commented out, which resulted in channels getting skipped without me realizing it. I took the leading # off because I want to see the error; I suppose others might not, so to each their own. (The re-enabled version is sketched below.)
We have 138 public channels and almost 300K public messages; our general channel alone is over 10K messages. It definitely took a while to run, but the upside was that Slack didn't cut off the requests, since they were apparently throttled enough.
I'm still learning, so if there is an easier/better way to throttle this, I'm all ears. In the meantime, I know this works.
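For reference, here's a sketch of what def retryloop looks like with that last line re-enabled (nothing else changed, just the # removed from the original function):

def retryloop(attempts, timeout):
    starttime = time.time()
    success = set()
    for i in range(attempts):
        success.add(True)
        yield success.clear
        if success:
            return
        if time.time() > starttime + timeout:
            break
    raise RetryError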
@Benoit99, there's a simpler way to keep the rate limits from choking you out!
You just need to add a small sleep() call at the point where the script decides whether it needs to fetch the next page.
Here's what the updated portion of my getHistory function looks like:
        if response['has_more'] == True:
            lastTimestamp = messages[-1]['ts']  # -1 means last element in a list
            print("Sleeping to avoid rate limits....")
            time.sleep(2)
        else:
            break
    return messages
I'd recommend updating this to note that tokens are now "legacy" but can still be generated at https://api.slack.com/custom-integrations/legacy-tokens
I've run this twice and it appears it is skipping some channels altogether. I didn't notice at first, but I realized one of our biggest channels, "general", wasn't in the resulting channels folder with the other json files. I noticed a few other channels missing too. Is there something I need to change in the script? Thanks!