Created
August 12, 2023 19:15
-
-
Save AksAman/ab1bee846bc4d2977d07f147061e40af to your computer and use it in GitHub Desktop.
Subtitle Parser for Youtube Captions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Example SRT input (cues must be separated from each other by a blank line): | |
| --- | |
| 1 | |
| 00:00:00,660 --> 00:00:04,600 | |
| [Music] | |
| 2 | |
| 00:00:05,040 --> 00:00:06,960 | |
| Lorem Ipsum is simply dummy text of the printing | |
| 3 | |
| 00:00:06,960 --> 00:00:08,880 | |
| and typesetting industry. Lorem Ipsum has been the | |
| 4 | |
| 00:00:08,880 --> 00:00:10,719 | |
| industry's standard dummy text ever since the 1500s, | |
| 5 | |
| 00:00:10,719 --> 00:00:12,880 | |
| when an unknown printer took a galley of type and scrambled it to | |
| 6 | |
| 00:00:12,880 --> 00:00:15,120 | |
| make a type specimen book. It has survived not only five centuries | |
| 7 | |
| 00:00:15,120 --> 00:00:17,680 | |
| but also the leap into electronic typesetting | |
| 8 | |
| 00:00:17,680 --> 00:00:19,600 | |
| remaining essentially unchanged | |
| Output: | |
| --- | |
| [ | |
| { | |
| "number": 1, | |
| "start_time": "00:00:00,660", | |
| "end_time": "00:00:04,600", | |
| "text": "[Music]", | |
| "start_time_ms": 660, | |
| "end_time_ms": 4600 | |
| }, | |
| { | |
| "number": 2, | |
| "start_time": "00:00:05,040", | |
| "end_time": "00:00:06,960", | |
| "text": "Lorem Ipsum is simply dummy text of the printing", | |
| "start_time_ms": 5040, | |
| "end_time_ms": 6960 | |
| }, | |
| { | |
| "number": 3, | |
| "start_time": "00:00:06,960", | |
| "end_time": "00:00:08,880", | |
| "text": "and typesetting industry. Lorem Ipsum has been the", | |
| "start_time_ms": 6960, | |
| "end_time_ms": 8880 | |
| }, | |
| { | |
| "number": 4, | |
| "start_time": "00:00:08,880", | |
| "end_time": "00:00:10,719", | |
| "text": "industry's standard dummy text ever since the 1500s,", | |
| "start_time_ms": 8880, | |
| "end_time_ms": 10719 | |
| }, | |
| { | |
| "number": 5, | |
| "start_time": "00:00:10,719", | |
| "end_time": "00:00:12,880", | |
| "text": "when an unknown printer took a galley of type and scrambled it to", | |
| "start_time_ms": 10719, | |
| "end_time_ms": 12880 | |
| }, | |
| { | |
| "number": 6, | |
| "start_time": "00:00:12,880", | |
| "end_time": "00:00:15,120", | |
| "text": "make a type specimen book. It has survived not only five centuries", | |
| "start_time_ms": 12880, | |
| "end_time_ms": 15120 | |
| }, | |
| { | |
| "number": 7, | |
| "start_time": "00:00:15,120", | |
| "end_time": "00:00:17,680", | |
| "text": "but also the leap into electronic typesetting", | |
| "start_time_ms": 15120, | |
| "end_time_ms": 17680 | |
| }, | |
| { | |
| "number": 8, | |
| "start_time": "00:00:17,680", | |
| "end_time": "00:00:19,600", | |
| "text": "remaining essentially unchanged", | |
| "start_time_ms": 17680, | |
| "end_time_ms": 19600 | |
| } | |
| ] | |
| """ | |
| import json | |
| import re | |
| from pathlib import Path | |
def time_to_ms(timecode: str) -> int:
    """Convert an ``HH:MM:SS,mmm`` timecode into total milliseconds.

    The millisecond separator may be a comma (SRT style) or a period
    (WebVTT style), so ``"00:00:04,600"`` and ``"00:00:04.600"`` both
    yield ``4600``.

    Args:
        timecode: Timecode made of exactly four integer fields: hours,
            minutes, seconds and milliseconds.

    Returns:
        The timecode expressed as a whole number of milliseconds.

    Raises:
        ValueError: If the timecode does not consist of exactly four
            integer fields.
    """
    fields = re.split(r"[:,.]", timecode)
    if len(fields) != 4:
        raise ValueError(
            f"Invalid timecode {timecode!r}: expected the form HH:MM:SS,mmm"
        )
    try:
        hours, minutes, seconds, milliseconds = map(int, fields)
    except ValueError:
        # Re-raise with the offending timecode so the caller can locate it.
        raise ValueError(
            f"Invalid timecode {timecode!r}: every field must be an integer"
        ) from None
    return (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
def parse_subtitles(subtitle_text):
    """Parse SRT-formatted text into a list of subtitle dictionaries.

    Cues are expected to be separated by blank lines. Each cue consists of
    an index line, a ``start --> end`` timecode line and one or more lines
    of text. Blocks that do not follow this layout (fewer than three lines,
    a non-numeric index, or an unparsable timecode line) are skipped.

    Args:
        subtitle_text: Full contents of an SRT file. Unix (LF), Windows
            (CRLF) and bare CR line endings are accepted, as is a leading
            byte-order mark.

    Returns:
        A list of dicts, one per cue, in input order, with the keys
        ``number``, ``start_time``, ``end_time``, ``text``,
        ``start_time_ms`` and ``end_time_ms``.
    """
    # Compiled once, outside the loop, since it is the same for every cue.
    timecode_pattern = re.compile(
        r"(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})"
    )

    # A BOM would break int() on the first index line, and CRLF line
    # endings would hide the blank-line separators, so normalise both.
    normalized = (
        subtitle_text.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n")
    )

    subtitles = []
    for block in re.split(r"\n{2,}", normalized.strip()):
        lines = block.strip().split("\n")
        if len(lines) < 3:
            continue

        try:
            number = int(lines[0])
        except ValueError:
            # Not a cue index; skip like any other malformed block.
            continue

        timecode_match = timecode_pattern.search(lines[1])
        if timecode_match is None:
            continue

        start_time, end_time = timecode_match.groups()
        subtitles.append(
            {
                "number": number,
                "start_time": start_time,
                "end_time": end_time,
                # Multi-line cue text keeps its internal line breaks.
                "text": "\n".join(lines[2:]),
                "start_time_ms": time_to_ms(start_time),
                "end_time_ms": time_to_ms(end_time),
            }
        )
    return subtitles
def main(
    input_path="path to your subtitle path",
    output_path="path to output file.json",
):
    """Convert an SRT subtitle file into a JSON file of parsed cues.

    Args:
        input_path: Location of the SRT file to read. It is decoded as
            UTF-8, and a leading byte-order mark is dropped if present.
        output_path: Location of the JSON file to write. It is encoded as
            UTF-8 and non-ASCII caption text is written unescaped.

    Returns:
        None. If the input file does not exist, or the output location
        cannot be created because its directory is missing, a message is
        printed and nothing else happens.
    """
    filename = Path(input_path)
    try:
        # An explicit encoding keeps behaviour identical across platforms;
        # "utf-8-sig" also strips a BOM when the file starts with one.
        with open(filename, "r", encoding="utf-8-sig") as file:
            subtitle_text = file.read()
    except FileNotFoundError:
        print("File not found.")
        return

    subtitles = parse_subtitles(subtitle_text)

    try:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(subtitles, f, indent=4, ensure_ascii=False)
    except FileNotFoundError:
        # Raised on write only when the parent directory is missing.
        print(f"Output directory not found: {Path(output_path).parent}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment