@SansPapyrus683
Last active December 8, 2025 20:44
download all your twitter anime girls!
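three scripts here: one deletes byte-identical duplicate downloads, one rebuilds retweeted media from a twitter archive export at original quality, and one re-dates the files so sorting by date matches tweet order.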
# script 1: delete byte-identical duplicates among already-downloaded images
import os
from collections import defaultdict

path = os.path.expanduser("~/OneDrive/Pictures/twitter")
os.chdir(path)

# group files by author (filenames follow author_tweetid_index.ext)
groups = defaultdict(list)
for i in os.listdir():
    split = i.find("_")
    author = i[:split]
    id_ = i[split : i.find("_", split + 1)]
    groups[author].append((id_, i))

tot = 0
for imgs in groups.values():
    to_delete = []
    seen = {}  # raw file bytes -> [filename, tweet id] of the first copy seen
    imgs.sort()
    for id_, i in imgs:
        data = open(i, "rb").read()
        if data not in seen:
            seen[data] = [i, id_]
        else:
            print(f"deleting {i} in favor of {seen[data][0]}")
            to_delete.append(i)
    if to_delete:
        print(to_delete)
    tot += len(to_delete)
    for i in to_delete:
        os.remove(i)

print(f"deleted {tot} duplicates in total")
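A quick illustration of the filename convention the dedupe pass assumes (the sample name is made up):

# illustration only: the sample filename is made up, following author_tweetid_index.ext
sample = "someartist_1234567890_2.png"
split = sample.find("_")
print(sample[:split])                               # someartist
print(sample[split : sample.find("_", split + 1)])  # _1234567890 (leading _ kept; it sorts fine)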
# script 2: rebuild retweeted media from a twitter archive export at original quality
import json
import os
import re
import shutil
import sys
from datetime import datetime, timedelta
from email import utils

import requests


def extract_id(tweet: str) -> int:
    """Pull the tweet ID out of a handle_tweetid_index.ext filename."""
    tweet = os.path.splitext(tweet)[0]
    first = tweet.rfind("_")
    second = tweet[:first].rfind("_")
    return int(tweet[second + 1 : first])


def load_twt_obj(file: str) -> list:
    """Strip the `window.YTD... =` prefix from an archive .js file and parse the JSON."""
    raw = open(file, encoding="utf8").read()
    return json.loads(raw[raw.find("=") + 1 :])


# also add deleted-tweets.js if you're like that
tweets = load_twt_obj("data/tweets.js")

# merge deleted-tweet media into the main media folder
del_dir = "data/deleted_tweets_media"
gen_dir = "data/tweets_media"
for fn in os.listdir(del_dir):
    shutil.copy(os.path.join(del_dir, fn), gen_dir)

# source tweet IDs we already have locally; pass those folders as CLI args
have_alr = set()
for store in sys.argv[1:]:
    for name in os.listdir(store):
        have_alr.add(extract_id(name))

# index the archive media as {post id: {image id: extension}}
# after getting the actual images this isn't needed, but just in case
all_raw_media = os.listdir(gen_dir)
all_media = {}
for i in all_raw_media:
    post_id = i[: i.find("-")]
    img_id = i[i.find("-") + 1 : i.rfind(".")]
    _, ext = os.path.splitext(i)
    if post_id not in all_media:
        all_media[post_id] = {}
    all_media[post_id][img_id] = ext

# sort the tweets from oldest to newest
tweets.sort(key=lambda t: utils.parsedate_to_datetime(t["tweet"]["created_at"]))

handle_fmt = re.compile(r"RT @([^:]*):")
img_id_fmt = re.compile(r"http://pbs\.twimg\.com/media/([^\.*]*)\.")

os.makedirs("good_media", exist_ok=True)
all_paths = []
print(f"alright, a total of {len(tweets)} tweets to go through. let's go!")
for v, t in enumerate(tweets):
    if (v + 1) % 100 == 0:
        print(f"at tweet #{v + 1}")
    t = t["tweet"]

    # only retweets carry the original poster's handle in the text
    match = handle_fmt.match(t["full_text"])
    if match is None:
        continue
    handle = match.group(1)
    og_id = t["id"]

    if "media" not in t["entities"]:
        continue
    media = t["extended_entities"]["media"]
    src_id = [m["source_status_id"] for m in media]
    assert len(set(src_id)) == 1  # just a sanity check
    src_id = int(src_id[0])
    if src_id in have_alr:
        continue

    curr_paths = []

    # quick hack to get videos to download
    vid = all_media[og_id]
    # most videos are standalone. there's one (1) tweet so far that violates this
    if ".mp4" in vid.values() and len(vid) == 1:
        vid_id = list(vid.keys())[0]
        stupid_path = os.path.join(gen_dir, f"{og_id}-{vid_id}.mp4")
        sigma_path = f"good_media/{handle}_{src_id}_1.mp4"
        shutil.copy(stupid_path, sigma_path)
        curr_paths.append(sigma_path)

    for img_at, m in enumerate(media):
        img_id = img_id_fmt.match(m["media_url"])
        # sometimes you get things like ext_tw_video_thumb or tweet_video_thumb
        if img_id is None:
            continue
        img_id = img_id.group(1)
        if img_id not in all_media.get(og_id, []):
            continue
        ext = all_media[og_id][img_id]
        sigma_path = f"good_media/{handle}_{src_id}_{img_at + 1}{ext}"
        stupid_path = os.path.join(gen_dir, f"{og_id}-{img_id}{ext}")
        # :orig asks pbs.twimg.com for the original resolution;
        # fall back to the archive copy if the download comes back empty
        dl_url = f"http://pbs.twimg.com/media/{img_id}{ext}:orig"
        img_data = requests.get(dl_url).content
        if not img_data:
            shutil.copy(stupid_path, sigma_path)
        else:
            with open(sigma_path, "wb") as written:
                written.write(img_data)
        curr_paths.append(sigma_path)

    all_paths.extend(reversed(curr_paths))

# back-date mtimes two seconds apart so the newest tweets sort first
now = datetime.now()
epoch = datetime(1970, 1, 1)
for v, p in enumerate(reversed(all_paths)):
    delta = (now - timedelta(seconds=2 * v) - epoch).total_seconds()
    os.utime(p, times=(delta, delta))
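Run the archive script from the folder containing data/; any folders of already-saved images passed on the command line get their tweets skipped. A hypothetical invocation (the script name is a placeholder):

# hypothetical invocation; rebuild_media.py is a placeholder name
python rebuild_media.py ~/OneDrive/Pictures/twitter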
# script 3: re-space file mtimes so sorting by date matches descending tweet ID
import os
from collections import defaultdict
from datetime import datetime, timedelta

path = os.path.expanduser("~/OneDrive/Pictures/twitter")
os.chdir(path)

# group files by tweet ID (filenames follow author_tweetid_index.ext)
groups = defaultdict(list)
for i in os.listdir():
    start = i.rfind("_") + 1
    end = i.rfind(".")
    num = int(i[start:end])
    author_end = i.rfind("_", 0, start - 1)
    author = i[:author_end]
    id_ = int(i[author_end + 1 : start - 1])
    groups[id_].append((author, num, i))

now = datetime.now()
epoch = datetime(1970, 1, 1)
at = 0
# newest tweet IDs get the newest timestamps, spaced two seconds apart
for id_, tweets in sorted(groups.items(), reverse=True):
    tweets.sort(key=lambda t: t[1])
    all_authors = {t[0].lower() for t in tweets}
    all_nums = [t[1] for t in tweets]
    assert len(all_authors) == 1  # one author per tweet
    assert all_nums == list(range(1, len(all_nums) + 1))  # indices run 1..n with no gaps
    for t in tweets:
        delta = (now - timedelta(seconds=2 * at) - epoch).total_seconds()
        os.utime(t[2], times=(delta, delta))
        at += 1
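To sanity-check either timestamp pass, a minimal sketch: listing by mtime, newest first, should come back in descending tweet-ID order.

# sanity check (illustrative): run inside the image folder
import os

for f in sorted(os.listdir(), key=os.path.getmtime, reverse=True)[:10]:
    print(f, os.path.getmtime(f))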