-
Star
(107)
You must be signed in to star a gist -
Fork
(32)
You must be signed in to fork a gist
-
-
Save glasslion/b2fcad16bc8a9630dbd7a945ab5ebf5e to your computer and use it in GitHub Desktop.
| """ | |
| Convert YouTube subtitles(vtt) to human readable text. | |
| Download only subtitles from YouTube with youtube-dl: | |
| youtube-dl --skip-download --convert-subs vtt <video_url> | |
| Note that default subtitle format provided by YouTube is ass, which is hard | |
| to process with simple regex. Luckily youtube-dl can convert ass to vtt, which | |
| is easier to process. | |
| To convert all vtt files inside a directory: | |
| find . -name "*.vtt" -exec python vtt2text.py {} \; | |
| """ | |
| import sys | |
| import re | |
def remove_tags(text):
    """Strip vtt markup (cue tags and inline timestamps) from *text*."""
    markup_patterns = (
        r'</c>',
        r'<c(\.color\w+)?>',
        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
    )
    for pattern in markup_patterns:
        text = re.sub(pattern, '', text)
    # Reduce each cue timing line to its HH:MM start time only.
    text = re.sub(
        r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%',
        r'\g<1>',
        text,
    )
    # Blank out lines that contain nothing but whitespace.
    return re.sub(r'^\s+$', '', text, flags=re.MULTILINE)
def remove_header(lines):
    """
    Drop the vtt file header.

    Looks for the known header markers in order; the last marker found
    wins, and everything up to and including it is discarded.  If no
    marker is present the list is returned unchanged.
    """
    cut = 0
    for marker in ('##', 'Language: en'):
        if marker in lines:
            cut = lines.index(marker) + 1
    return lines[cut:]
def merge_duplicates(lines):
    """
    Drop duplicated subtitles.  Duplicates are always adjacent in the
    input, so comparing each line with the last emitted line of its
    kind (timestamp vs. caption) is sufficient.

    Yields the de-duplicated timestamp and caption lines; blank lines
    are skipped entirely.
    """
    # Raw string and a single compile: the old non-raw '^\d...' form
    # triggers invalid-escape warnings on modern Python.
    timestamp_re = re.compile(r'^\d{2}:\d{2}$')
    last_timestamp = ''
    last_cap = ''
    for line in lines:
        if line == "":
            continue
        if timestamp_re.match(line):
            if line != last_timestamp:
                yield line
                last_timestamp = line
        else:
            if line != last_cap:
                yield line
                last_cap = line
def merge_short_lines(lines):
    """
    Merge consecutive caption lines into chunks of fewer than ~80
    characters.  Timestamp and blank lines are passed through prefixed
    with a newline so each cue starts on its own paragraph.

    Fixes two defects of the original implementation: the pending
    buffer is now flushed *before* a timestamp/blank line is emitted
    (previously text preceding a timestamp appeared after it), and the
    final chunk is stripped and only yielded when non-empty.
    """
    buffer = ''
    for line in lines:
        if line == "" or re.match(r'^\d{2}:\d{2}$', line):
            if buffer:
                # Flush pending text so it stays before its timestamp.
                yield buffer.strip()
                buffer = ''
            yield '\n' + line
            continue
        if len(line + buffer) < 80:
            buffer += ' ' + line
        else:
            yield buffer.strip()
            buffer = line
    if buffer:
        yield buffer.strip()
def main():
    """
    Convert the vtt file named on the command line (sys.argv[1]) to a
    plain-text file with the same name and a .txt extension.
    """
    vtt_file_name = sys.argv[1]
    # Escape the dot: the old r'.vtt$' matched ANY character before
    # "vtt", so e.g. "clip_avtt" would have been rewritten too.
    txt_name = re.sub(r'\.vtt$', '.txt', vtt_file_name)
    with open(vtt_file_name) as f:
        text = f.read()
    text = remove_tags(text)
    lines = text.splitlines()
    lines = remove_header(lines)
    lines = merge_duplicates(lines)
    lines = list(lines)
    lines = merge_short_lines(lines)
    lines = list(lines)
    with open(txt_name, 'w') as f:
        for line in lines:
            f.write(line)
            f.write("\n")


if __name__ == "__main__":
    main()
Thanks a lot for the script @glasslion.
Just found out this script after I made this one:
https://gist.github.com/arturmartins/1c78de3e8c21ffce81a17dc2f2181de4
Might be of help to some.
Would a command-line tool with interface below be welcome?
yt-text bZ6pA--F3D4 > subtitles.txt
or better with full URL?
yt-text https://youtu.be/bZ6pA--F3D4 > subtitles.txt
Would a command-line tool with interface below be welcome?
yt-text bZ6pA--F3D4 > subtitles.txt
or better with full URL?
yt-text https://youtu.be/bZ6pA--F3D4 > subtitles.txt
Yes, it would be 😁
EDIT: For anyone interested, https://gist.github.com/epogrebnyak/ba87ba52f779f7ebd93b04b2af1059aa
Hi everyone, wrapped this script here: https://github.com/epogrebnyak/justsubs
Sample usage:
from justsubs import Video
subs = Video("KzWS7gJX5Z8").subtitles(language="en-uYU-mmqFLq8")
subs.download()
print(subs.get_text_blocks()[:10])
print(subs.get_plain_text()[:550])
It seems simply "en" does not work; you need "en-uYU-mmqFLq8".
Also pip install justsubs should work
For YouTube subtitles, there were some timestamps and metadata remaining while using the script.
I've fixed it here:
https://gist.github.com/florentroques/c08bbe54fba42ec56c9d48229ed9c49b
if you want to join me on a Stream we can walk though it and record podcast/video for HackerPublicRadio.org ! just hit me up sometime freeload01____yahoo.com