AksAman · August 12, 2023 19:15
diff --git a/youtube_subtitle_parser.py b/youtube_subtitle_parser.py
 """
 Parse SRT subtitle text into a list of dictionaries and dump them as JSON.

 Example srt:
 ---
 1
 00:00:00,660 --> 00:00:04,600
 [Music]

 2
 00:00:05,040 --> 00:00:06,960
 Lorem Ipsum is simply dummy text of the printing

 3
 00:00:06,960 --> 00:00:08,880
 and typesetting industry. Lorem Ipsum has been the

 4
 00:00:08,880 --> 00:00:10,719
 industry's standard dummy text ever since the 1500s, 

 5
 00:00:10,719 --> 00:00:12,880
 when an unknown printer took a galley of type and scrambled it to

 6
 00:00:12,880 --> 00:00:15,120
 make a type specimen book. It has survived not only five centuries

 7
 00:00:15,120 --> 00:00:17,680
 but also the leap into electronic typesetting

 8
 00:00:17,680 --> 00:00:19,600
 remaining essentially unchanged

 Output:
 ---
 [
    {
        "number": 1,
        "start_time": "00:00:00,660",
        "end_time": "00:00:04,600",
        "text": "[Music]",
        "start_time_ms": 660,
        "end_time_ms": 4600
    },
    {
        "number": 2,
        "start_time": "00:00:05,040",
        "end_time": "00:00:06,960",
        "text": "Lorem Ipsum is simply dummy text of the printing",
        "start_time_ms": 5040,
        "end_time_ms": 6960
    },
    {
        "number": 3,
        "start_time": "00:00:06,960",
        "end_time": "00:00:08,880",
        "text": "and typesetting industry. Lorem Ipsum has been the",
        "start_time_ms": 6960,
        "end_time_ms": 8880
    },
    {
        "number": 4,
        "start_time": "00:00:08,880",
        "end_time": "00:00:10,719",
        "text": "industry's standard dummy text ever since the 1500s,",
        "start_time_ms": 8880,
        "end_time_ms": 10719
    },
    {
        "number": 5,
        "start_time": "00:00:10,719",
        "end_time": "00:00:12,880",
        "text": "when an unknown printer took a galley of type and scrambled it to",
        "start_time_ms": 10719,
        "end_time_ms": 12880
    },
    {
        "number": 6,
        "start_time": "00:00:12,880",
        "end_time": "00:00:15,120",
        "text": "make a type specimen book. It has survived not only five centuries",
        "start_time_ms": 12880,
        "end_time_ms": 15120
    },
    {
        "number": 7,
        "start_time": "00:00:15,120",
        "end_time": "00:00:17,680",
        "text": "but also the leap into electronic typesetting",
        "start_time_ms": 15120,
        "end_time_ms": 17680
    },
    {
        "number": 8,
        "start_time": "00:00:17,680",
        "end_time": "00:00:19,600",
        "text": "remaining essentially unchanged",
        "start_time_ms": 17680,
        "end_time_ms": 19600
    }
 ]
 """




 import json
 import re
 from pathlib import Path


 def time_to_ms(timecode: str):
     """Return the total number of milliseconds encoded by an SRT timecode.

     The timecode is split on colons and commas and must yield exactly four
     integer fields: hours, minutes, seconds and milliseconds
     (for example ``00:00:05,040`` gives ``5040``).
     """
     # Colons and the comma are all treated as plain field separators.
     fields = [int(piece) for piece in re.split(r"[:,]", timecode)]
     hh, mm, ss, ms = fields
     whole_seconds = hh * 3600 + mm * 60 + ss
     return whole_seconds * 1000 + ms


 def parse_subtitles(subtitle_text):
     """Parse SRT-formatted text into a list of subtitle dictionaries.

     Each cue is expected to be a block of at least three lines: the cue
     number, a ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timecode line, and one or
     more lines of text. Blocks are separated by blank lines. Blocks with
     fewer than three lines, or whose second line holds no timecode pair,
     are skipped.

     A leading byte-order mark (U+FEFF) is dropped and CRLF / CR line endings
     are normalised to LF before splitting, so files saved on Windows parse
     the same way as LF-only files.

     Args:
         subtitle_text: The full contents of an SRT file as a string.

     Returns:
         A list of dicts with the keys ``number``, ``start_time``,
         ``end_time``, ``text``, ``start_time_ms`` and ``end_time_ms``,
         in the order the cues appear in the input.

     Raises:
         ValueError: If the first line of a block with three or more lines
             is not an integer.
     """
     # Compiled once here instead of being rebuilt for every block.
     timecode_re = re.compile(
         r"(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})"
     )

     # A BOM glued to the first cue number would make int() fail, and CRLF
     # endings would hide the blank-line separators from the split below.
     normalized = (
         subtitle_text.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n")
     )

     subtitles = []

     # Separator lines may contain stray spaces or tabs; treat them as blank.
     for block in re.split(r"\n[ \t]*\n", normalized.strip()):
         lines = block.strip().split("\n")
         if len(lines) < 3:
             continue

         subtitle_number = int(lines[0])

         timecode_match = timecode_re.search(lines[1])
         if timecode_match is None:
             continue

         start_time, end_time = timecode_match.groups()

         subtitles.append(
             {
                 "number": subtitle_number,
                 "start_time": start_time,
                 "end_time": end_time,
                 # Multi-line cues keep their internal line breaks.
                 "text": "\n".join(lines[2:]),
                 "start_time_ms": time_to_ms(start_time),
                 "end_time_ms": time_to_ms(end_time),
             }
         )

     return subtitles


 def main():
     """Convert the subtitle file at the configured path into a JSON file.

     Reads the subtitle file, parses it with ``parse_subtitles`` and writes
     the resulting list to the output path as JSON indented by four spaces.
     Prints ``File not found.`` instead of raising when opening either path
     fails with ``FileNotFoundError``.
     """
     filename = Path("path to your subtitle path")
     try:
         # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark, so
         # the text no longer depends on the platform's default encoding.
         with open(filename, "r", encoding="utf-8-sig") as file:
             subtitle_text = file.read()

         subtitles = parse_subtitles(subtitle_text)

         with open("path to output file.json", "w", encoding="utf-8") as f:
             json.dump(subtitles, f, indent=4)
     except FileNotFoundError:
         print("File not found.")

 # Script entry point: run the conversion only when this file is executed
 # directly, not when it is imported as a module.
 if __name__ == "__main__":
    main()
	"""
	Example srt:
	---
	1
	00:00:00,660 --> 00:00:04,600
	[Music]

	2
	00:00:05,040 --> 00:00:06,960
	Lorem Ipsum is simply dummy text of the printing

	3
	00:00:06,960 --> 00:00:08,880
	and typesetting industry. Lorem Ipsum has been the

	4
	00:00:08,880 --> 00:00:10,719
	industry's standard dummy text ever since the 1500s,

	5
	00:00:10,719 --> 00:00:12,880
	when an unknown printer took a galley of type and scrambled it to

	6
	00:00:12,880 --> 00:00:15,120
	make a type specimen book. It has survived not only five centuries

	7
	00:00:15,120 --> 00:00:17,680
	but also the leap into electronic typesetting

	8
	00:00:17,680 --> 00:00:19,600
	remaining essentially unchanged

	Output:
	---
	[
	{
	"number": 1,
	"start_time": "00:00:00,660",
	"end_time": "00:00:04,600",
	"text": "[Music]",
	"start_time_ms": 660,
	"end_time_ms": 4600
	},
	{
	"number": 2,
	"start_time": "00:00:05,040",
	"end_time": "00:00:06,960",
	"text": "Lorem Ipsum is simply dummy text of the printing",
	"start_time_ms": 5040,
	"end_time_ms": 6960
	},
	{
	"number": 3,
	"start_time": "00:00:06,960",
	"end_time": "00:00:08,880",
	"text": "and typesetting industry. Lorem Ipsum has been the",
	"start_time_ms": 6960,
	"end_time_ms": 8880
	},
	{
	"number": 4,
	"start_time": "00:00:08,880",
	"end_time": "00:00:10,719",
	"text": "industry's standard dummy text ever since the 1500s,",
	"start_time_ms": 8880,
	"end_time_ms": 10719
	},
	{
	"number": 5,
	"start_time": "00:00:10,719",
	"end_time": "00:00:12,880",
	"text": "when an unknown printer took a galley of type and scrambled it to",
	"start_time_ms": 10719,
	"end_time_ms": 12880
	},
	{
	"number": 6,
	"start_time": "00:00:12,880",
	"end_time": "00:00:15,120",
	"text": "make a type specimen book. It has survived not only five centuries",
	"start_time_ms": 12880,
	"end_time_ms": 15120
	},
	{
	"number": 7,
	"start_time": "00:00:15,120",
	"end_time": "00:00:17,680",
	"text": "but also the leap into electronic typesetting",
	"start_time_ms": 15120,
	"end_time_ms": 17680
	},
	{
	"number": 8,
	"start_time": "00:00:17,680",
	"end_time": "00:00:19,600",
	"text": "remaining essentially unchanged",
	"start_time_ms": 17680,
	"end_time_ms": 19600
	}
	]
	"""




	import json
	import re
	from pathlib import Path


	def time_to_ms(timecode: str) -> int:
	    """Convert an ``HH:MM:SS,mmm`` timecode into milliseconds.

	    Args:
	        timecode: A timecode whose hour, minute, second and millisecond
	            fields are separated by colons and a comma.

	    Returns:
	        The total number of milliseconds the timecode represents.

	    Raises:
	        ValueError: If the timecode does not split into exactly four
	            integer fields.
	    """
	    hours, minutes, seconds, milliseconds = map(int, re.split(r"[:,]", timecode))
	    total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
	    return total_ms


	def parse_subtitles(subtitle_text):
	    """Parse SRT-formatted text into a list of subtitle dictionaries.

	    Each cue is expected to be a block of at least three lines: the cue
	    number, a ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timecode line, and one or
	    more lines of text. Blocks are separated by blank lines. Blocks with
	    fewer than three lines, or whose second line holds no timecode pair,
	    are skipped.

	    A leading byte-order mark (U+FEFF) is dropped and CRLF / CR line
	    endings are normalised to LF before splitting.

	    Args:
	        subtitle_text: The full contents of an SRT file as a string.

	    Returns:
	        A list of dicts with the keys ``number``, ``start_time``,
	        ``end_time``, ``text``, ``start_time_ms`` and ``end_time_ms``,
	        in the order the cues appear in the input.

	    Raises:
	        ValueError: If the first line of a block with three or more lines
	            is not an integer.
	    """
	    # Whitespace around the arrow is optional; compiled once, not per block.
	    timecode_re = re.compile(
	        r"(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})"
	    )

	    # A BOM glued to the first cue number would make int() fail, and CRLF
	    # endings would hide the blank-line separators from the split below.
	    normalized = (
	        subtitle_text.lstrip("\ufeff").replace("\r\n", "\n").replace("\r", "\n")
	    )

	    subtitles = []

	    # Separator lines may contain stray spaces or tabs; treat them as blank.
	    for block in re.split(r"\n[ \t]*\n", normalized.strip()):
	        lines = block.strip().split("\n")
	        if len(lines) < 3:
	            continue

	        subtitle_number = int(lines[0])

	        timecode_match = timecode_re.search(lines[1])
	        if timecode_match is None:
	            continue

	        start_time, end_time = timecode_match.groups()

	        subtitles.append(
	            {
	                "number": subtitle_number,
	                "start_time": start_time,
	                "end_time": end_time,
	                # Multi-line cues keep their internal line breaks.
	                "text": "\n".join(lines[2:]),
	                "start_time_ms": time_to_ms(start_time),
	                "end_time_ms": time_to_ms(end_time),
	            }
	        )

	    return subtitles


	def main():
	    """Convert the subtitle file at the configured path into a JSON file.

	    Reads the subtitle file, parses it with ``parse_subtitles`` and writes
	    the resulting list to the output path as JSON indented by four spaces.
	    Prints ``File not found.`` instead of raising when opening either path
	    fails with ``FileNotFoundError``.
	    """
	    filename = Path("path to your subtitle path")
	    try:
	        # "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark, so
	        # the text no longer depends on the platform's default encoding.
	        with open(filename, "r", encoding="utf-8-sig") as file:
	            subtitle_text = file.read()

	        subtitles = parse_subtitles(subtitle_text)

	        with open("path to output file.json", "w", encoding="utf-8") as f:
	            json.dump(subtitles, f, indent=4)
	    except FileNotFoundError:
	        print("File not found.")


	# Script entry point: run the conversion only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()
No results found