Skip to content

Instantly share code, notes, and snippets.

@AksAman
Created August 12, 2023 19:15
Show Gist options
  • Select an option

  • Save AksAman/ab1bee846bc4d2977d07f147061e40af to your computer and use it in GitHub Desktop.

Select an option

Save AksAman/ab1bee846bc4d2977d07f147061e40af to your computer and use it in GitHub Desktop.
Subtitle Parser for YouTube Captions
"""
Example srt (cues are separated by a blank line):
---
1
00:00:00,660 --> 00:00:04,600
[Music]

2
00:00:05,040 --> 00:00:06,960
Lorem Ipsum is simply dummy text of the printing

3
00:00:06,960 --> 00:00:08,880
and typesetting industry. Lorem Ipsum has been the

4
00:00:08,880 --> 00:00:10,719
industry's standard dummy text ever since the 1500s,

5
00:00:10,719 --> 00:00:12,880
when an unknown printer took a galley of type and scrambled it to

6
00:00:12,880 --> 00:00:15,120
make a type specimen book. It has survived not only five centuries

7
00:00:15,120 --> 00:00:17,680
but also the leap into electronic typesetting

8
00:00:17,680 --> 00:00:19,600
remaining essentially unchanged
Output:
---
[
{
"number": 1,
"start_time": "00:00:00,660",
"end_time": "00:00:04,600",
"text": "[Music]",
"start_time_ms": 660,
"end_time_ms": 4600
},
{
"number": 2,
"start_time": "00:00:05,040",
"end_time": "00:00:06,960",
"text": "Lorem Ipsum is simply dummy text of the printing",
"start_time_ms": 5040,
"end_time_ms": 6960
},
{
"number": 3,
"start_time": "00:00:06,960",
"end_time": "00:00:08,880",
"text": "and typesetting industry. Lorem Ipsum has been the",
"start_time_ms": 6960,
"end_time_ms": 8880
},
{
"number": 4,
"start_time": "00:00:08,880",
"end_time": "00:00:10,719",
"text": "industry's standard dummy text ever since the 1500s,",
"start_time_ms": 8880,
"end_time_ms": 10719
},
{
"number": 5,
"start_time": "00:00:10,719",
"end_time": "00:00:12,880",
"text": "when an unknown printer took a galley of type and scrambled it to",
"start_time_ms": 10719,
"end_time_ms": 12880
},
{
"number": 6,
"start_time": "00:00:12,880",
"end_time": "00:00:15,120",
"text": "make a type specimen book. It has survived not only five centuries",
"start_time_ms": 12880,
"end_time_ms": 15120
},
{
"number": 7,
"start_time": "00:00:15,120",
"end_time": "00:00:17,680",
"text": "but also the leap into electronic typesetting",
"start_time_ms": 15120,
"end_time_ms": 17680
},
{
"number": 8,
"start_time": "00:00:17,680",
"end_time": "00:00:19,600",
"text": "remaining essentially unchanged",
"start_time_ms": 17680,
"end_time_ms": 19600
}
]
"""
import json
import re
from pathlib import Path
def time_to_ms(timecode: str) -> int:
    """Convert an ``HH:MM:SS,mmm`` timecode to a total count of milliseconds.

    The separator before the milliseconds may be a comma (standard SRT) or
    a period (as written by WebVTT and some SRT exporters).

    Args:
        timecode: Timecode string such as ``"00:00:05,040"``.

    Returns:
        The timecode expressed in milliseconds, e.g. ``5040``.

    Raises:
        ValueError: If the timecode does not split into exactly four
            integer fields.
    """
    # Splitting on ":", "," and "." yields hours, minutes, seconds, millis.
    hours, minutes, seconds, milliseconds = map(
        int, re.split(r"[:,.]", timecode)
    )
    return (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
def parse_subtitles(subtitle_text):
    """Parse SRT-formatted text into a list of subtitle dictionaries.

    Cues are separated by blank lines. Each cue needs at least three lines:
    a numeric index, a ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timecode line, and
    one or more lines of text. Cues that do not match this shape (too few
    lines, non-numeric index, unrecognised timecode line) are skipped.

    Args:
        subtitle_text: Full contents of an SRT file as a string. A leading
            byte-order mark and CRLF/CR line endings are tolerated.

    Returns:
        A list of dicts, in input order, each with the keys ``number``,
        ``start_time``, ``end_time``, ``text``, ``start_time_ms`` and
        ``end_time_ms``.
    """
    # Compiled once up front instead of being rebuilt for every cue.
    timecode_re = re.compile(
        r"(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})"
    )

    # A BOM would otherwise stick to the first index and break int(); raw
    # CRLF / CR input would otherwise never split on blank lines.
    normalized = (
        subtitle_text.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n")
    )

    subtitles = []
    # A separator line may be empty or contain only spaces/tabs.
    for block in re.split(r"\n[ \t]*\n", normalized.strip()):
        lines = block.strip().split("\n")
        if len(lines) < 3:
            continue

        try:
            number = int(lines[0])
        except ValueError:
            # Skip a cue with a malformed index, the same way a cue with a
            # malformed timecode line is skipped below.
            continue

        match = timecode_re.search(lines[1])
        if match is None:
            continue

        start_time, end_time = match.groups()
        subtitles.append(
            {
                "number": number,
                "start_time": start_time,
                "end_time": end_time,
                # Multi-line cue text keeps its internal line breaks.
                "text": "\n".join(lines[2:]),
                "start_time_ms": time_to_ms(start_time),
                "end_time_ms": time_to_ms(end_time),
            }
        )
    return subtitles
def main() -> None:
    """Read an SRT file, parse it, and write the cues to a JSON file.

    The input and output paths are placeholders to be edited before running.
    If the input file does not exist, a message is printed and nothing is
    written.
    """
    filename = Path("path to your subtitle path")
    try:
        # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, rather
        # than relying on the platform's default encoding.
        with open(filename, "r", encoding="utf-8-sig") as file:
            subtitle_text = file.read()
    except FileNotFoundError:
        print("File not found.")
        return

    # Kept outside the try so a problem with the output path is not reported
    # as a missing input file.
    subtitles = parse_subtitles(subtitle_text)
    with open("path to output file.json", "w", encoding="utf-8") as f:
        json.dump(subtitles, f, indent=4)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment