Created
November 7, 2025 18:48
-
-
Save abdusco/118a6a3ab41a0a1d2a5f8813f7895ca1 to your computer and use it in GitHub Desktop.
Transcribe YouTube videos
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env -S python3 | |
| from pathlib import Path | |
| import subprocess | |
| import json | |
| import sys | |
| import tempfile | |
| import argparse | |
| import logging | |
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse the command-line arguments for the transcript extractor.

    Args:
        argv: Argument strings to parse, excluding the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, so
            existing zero-argument calls behave exactly as before.

    Returns:
        A namespace whose only attribute, ``url``, holds the positional
        YouTube video URL.
    """
    parser = argparse.ArgumentParser(
        description="Extract transcript from YouTube video",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("url", help="YouTube video URL")
    return parser.parse_args(argv)
def _transcript_text(data: dict) -> str:
    """Concatenate every ``utf8`` segment of a parsed json3 document, in order."""
    return "".join(
        seg["utf8"]
        for event in data.get("events", [])
        # Events without a "segs" list contribute no text.
        for seg in event.get("segs") or []
        if "utf8" in seg
    )


def main() -> None:
    """Download a video's auto-generated subtitles and print them as plain text.

    Runs ``yt-dlp`` inside a temporary directory to fetch the automatic
    subtitles in ``json3`` format (no media is downloaded), then writes the
    concatenated subtitle text to stdout followed by a newline.

    Exits with status 1, after logging an error, when ``yt-dlp`` is not
    installed, when ``yt-dlp`` fails, or when no ``.json3`` file was produced.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
    args = parse_args()

    with tempfile.TemporaryDirectory() as tmpdir:
        temp_dir = Path(tmpdir)

        command = [
            "yt-dlp",
            "--skip-download",
            "--write-auto-subs",
            "--sub-format",
            "json3",
            "-o",
            "%(title)s",
            # End of options: a URL/ID starting with "-" must not be parsed
            # as a yt-dlp flag.
            "--",
            args.url,
        ]
        try:
            subprocess.run(
                command,
                cwd=temp_dir,
                stdout=subprocess.DEVNULL,
                # Capture stderr so a failure can be reported instead of
                # surfacing as a bare traceback with no diagnostics.
                stderr=subprocess.PIPE,
                text=True,
                errors="replace",
                check=True,
            )
        except FileNotFoundError:
            logging.error("yt-dlp executable not found; install it and make sure it is on PATH")
            sys.exit(1)
        except subprocess.CalledProcessError as exc:
            logging.error(
                "yt-dlp failed with exit code %d: %s",
                exc.returncode,
                (exc.stderr or "").strip(),
            )
            sys.exit(1)

        # Sorted so the choice is deterministic if several subtitle files exist.
        json_files = sorted(temp_dir.glob("*.json3"))
        if not json_files:
            logging.error("No transcript found")
            sys.exit(1)

        # Read as UTF-8 explicitly rather than with the platform default
        # encoding, which would garble non-ASCII text on some systems.
        with json_files[0].open(encoding="utf-8") as f:
            data = json.load(f)

    print(_transcript_text(data))
# Run the CLI only when this file is executed as a script, so importing the
# module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment