# 个人资料 - 偏好设置 - 请求归档
# 下载 ZIP,解压 `user_archive.csv`
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
python main.py >1.sh
# 进入浏览器开发者工具,复制 Cookie _t 值
# 打开 1.sh,将 Cookie _t 值写入 COOKIE_T 变量
bash 1.sh
# 成功下载的文件都在 output 里
# 下载失败的文件名在 missing 里,表示帖子已被删除
Last active
January 24, 2026 23:51
-
-
Save yoursunny/e2cc67d5931272de17799bf5407dd85d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| __pycache__/ | |
| *.py[codz] | |
| *$py.class | |
| /.venv | |
| /*.csv | |
| /*.sh | |
| /missing | |
| /output |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import re
import shlex
from pathlib import Path
from typing import Literal, TypedDict

from bs4 import BeautifulSoup
# Base URL of the forum.  Download paths printed by process_upload have this
# prefix stripped, and the generated shell function prepends it again.
SERVER = "https://shuiyuan.sjtu.edu.cn"


class Record(TypedDict):
    """One row of ``user_archive.csv`` as handled by this script.

    In the code visible here only ``is_pm``, ``post_raw`` and ``post_cooked``
    are read; the remaining keys are declared but not used.
    """

    topic_title: str
    categories: str
    is_pm: Literal["否", "是"]  # "是" (yes) rows are skipped by process_row
    post_raw: str  # scanned for upload://... references
    post_cooked: str  # parsed as HTML to resolve those references
    # NOTE(review): rows are produced by csv.DictReader, which yields every
    # value as str, so these two are strings at runtime despite the int hint.
    like_count: int
    reply_count: int
    url: str
    created_at: str
# Matches one ``upload://`` reference; the match extends until whitespace, a
# double quote or a closing parenthesis.
re_upload = re.compile(r'upload://[^\s\"\)]+')


def process_row(row: Record) -> None:
    """Print a download command for every upload referenced by one post.

    Rows whose ``is_pm`` is "是" are skipped.  For any other row, each
    ``upload://`` reference found in ``post_raw`` is handed to
    ``process_upload`` together with the post's ``post_cooked`` HTML.

    Args:
        row: One record of the archive CSV.
    """
    if row["is_pm"] == "是":
        return

    # One-slot cache shared by all uploads of this post, so process_upload
    # parses the cooked HTML at most once.
    cooked_bs: list[BeautifulSoup] = []
    for m in re_upload.finditer(row["post_raw"]):
        # Strip the scheme by name rather than by a hard-coded offset.
        process_upload(
            m[0].removeprefix("upload://"), row["post_cooked"], cooked_bs
        )
def process_upload(filename: str, cooked: str, cooked_bs: list[BeautifulSoup]) -> None:
    """Print a ``download <filename> <url>`` shell line for one upload.

    The URL is looked up in the post's HTML in two steps: first a literal
    ``/uploads/short-url/<stem>.jpeg?dl=1`` link, then the ``src`` of an
    ``<img>`` whose ``data-base62-sha1`` attribute equals the file stem.
    Nothing is printed when neither is found.

    Args:
        filename: Upload reference with the ``upload://`` scheme removed.
        cooked: HTML of the post that references the upload.
        cooked_bs: Caller-owned cache holding at most one parsed
            ``BeautifulSoup`` of ``cooked``.  It is filled here on first
            need so a post with several uploads is parsed only once.
    """
    basename = Path(filename).stem
    # NOTE(review): the extension is hard-coded to .jpeg, so uploads with any
    # other extension can only resolve through the <img> lookup below --
    # confirm this is intended.
    short_url = f"/uploads/short-url/{basename}.jpeg?dl=1"
    if short_url in cooked:
        url = short_url
    else:
        if not cooked_bs:
            cooked_bs.append(BeautifulSoup(cooked, features="html.parser"))
        found = cooked_bs[0].find("img", {"data-base62-sha1": basename})
        # An <img> without a usable src is treated like a missing image
        # instead of raising KeyError.
        src = None if found is None else found.get("src")
        if not src:
            return
        url = src
    # The generated download() function prepends SERVER, so emit a path only.
    url = url.removeprefix(SERVER)
    # Both values originate from post content and end up in a bash script:
    # quote them so characters such as ?, &, ; or $( ) stay literal.
    print(f"download {shlex.quote(filename)} {shlex.quote(url)}")
def main(filename: str) -> None:
    """Print to stdout a bash script that downloads every upload in the archive.

    The script starts with a fixed header: strict-mode settings, an empty
    ``COOKIE_T`` variable, creation of ``output/`` and ``missing/``, and a
    ``download <filename> <path>`` function.  That function skips files
    already present in either directory, fetches ``SERVER + path`` with the
    ``http`` command (presumably HTTPie -- confirm), and moves the file to
    ``missing/`` when the command fails.  After the header, ``process_row``
    is called for every CSV row and prints the individual ``download`` calls.

    Args:
        filename: Path of the archive CSV to read.
    """
    # Arguments inside download() are double-quoted so that file names and
    # URLs are never word-split or glob-expanded by bash.
    header = (
        "#!/bin/bash",
        "set -euo pipefail",
        "COOKIE_T=",
        "mkdir -p output/ missing/",
        "download() {",
        ' if [[ -f "output/$1" ]] || [[ -f "missing/$1" ]]; then',
        " return",
        " fi",
        f' if ! http -F -d -o "output/$1" GET "{SERVER}$2" "Cookie:_t=$COOKIE_T"; then',
        ' mv "output/$1" "missing/$1"',
        " fi",
        "}",
        "",
    )
    for line in header:
        print(line)

    # The archive contains non-ASCII text; decode it as UTF-8 explicitly
    # rather than relying on the platform's default encoding.
    with open(filename, newline="", encoding="utf-8") as csvfile:
        for row in csv.DictReader(csvfile):
            process_row(row)


if __name__ == "__main__":
    main("user_archive.csv")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| beautifulsoup4>=4.14.3,<5 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment