Last active
March 20, 2022 21:29
-
-
Save pineapplemachine/77e64c92aa75757e235dd4f887e6a4ef to your computer and use it in GitHub Desktop.
Batch speech-to-text Python script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| This script can be used to perform automated speech-to-text on every | |
| *.ogg audio file in a directory, recursively. | |
| This was written in order to help better document the items in the | |
| `sounds` directory represented in Elden Ring's Data0.bhd archive file. | |
| These were the steps taken: | |
| 1. Extract files including sounds/pck/normal.pck from Data0.bhd in | |
| Elden Ring's Game/ directory using ER.BDT.Tool: | |
| https://github.com/Ekey/ER.BDT.Tool | |
| 2. Extract OGG audio files from sounds/pck/normal.pck using | |
| this unpacker tool: | |
| https://github.com/Vextil/Wwise-Unpacker | |
| 3. Run this script using the directory containing the unpacked audio | |
| files as input. | |
| """ | |
| import os | |
| import subprocess | |
| import sys | |
| import speech_recognition | |
class VTTBatch:
    """Batch speech-to-text over every ``*.ogg`` file below a directory.

    Results are kept in ``speech_data`` (file path relative to the input
    directory -> recognized text) and persisted to ``output_path`` as one
    ``name<TAB>text`` line per file, so files that already have an entry
    are not processed again on a later run.
    """

    def __init__(self, output_path, input_path):
        """Store the paths, load earlier results and create the recognizer.

        output_path: file that results are read from and written to.
        input_path: directory that is searched recursively for OGG files.
        """
        self.output_path = output_path
        self.input_path = input_path
        self.speech_data = dict()
        self.load_speech_data()
        self.recognizer = speech_recognition.Recognizer()
        # Number of files actually transcribed by this instance.
        self.handled_count = 0

    def load_speech_data(self):
        """Fill ``speech_data`` from ``output_path`` if that file exists.

        Blank lines are skipped silently; other lines without a tab
        separator are reported and skipped instead of aborting the load.
        """
        if not os.path.exists(self.output_path):
            return
        print("Reading existing data from", self.output_path)
        with open(self.output_path, "rt", encoding="utf-8") as data_file:
            for raw_line in data_file:
                # Lines read from a file keep their newline, so they are
                # never falsy; strip it before deciding what the line is.
                line = raw_line.rstrip("\n")
                name, separator, text = line.partition("\t")
                if not separator:
                    if line.strip():
                        print("Ignoring malformed line: %s" % line)
                    continue
                self.speech_data[name] = text

    def write_speech_data(self):
        """Write every entry of ``speech_data`` to ``output_path``.

        Tabs and newlines inside the text are replaced by spaces so each
        entry stays on one line with a single separator.
        """
        print("Writing data to", self.output_path)
        with open(self.output_path, "wt", encoding="utf-8") as data_file:
            for name, text in self.speech_data.items():
                data_file.write("%s\t%s\n" % (
                    name, text.strip().replace("\t", " ").replace("\n", " ")
                ))

    def iter_input_files(self):
        """Walk ``input_path`` and yield one result per eligible OGG file.

        Files that do not end in ``.ogg`` or are smaller than 10000 bytes
        are skipped. Each yielded item is the tuple returned by
        ``handle_input_file``.
        """
        for root, _dirs, files in os.walk(self.input_path):
            for file_name in files:
                if not file_name.endswith(".ogg"):
                    continue
                file_path = os.path.join(root, file_name)
                # Ignore small files
                if os.path.getsize(file_path) < 10000:
                    continue
                yield self.handle_input_file(file_path)

    def handle_input_file(self, file_path):
        """Transcribe one OGG file unless a result is already stored.

        Returns ``(is_new, name, text)`` where ``name`` is the path relative
        to ``input_path``. ``is_new`` is False when the text came from
        ``speech_data``. ``text`` is ``"[Error]"`` when the conversion or
        the recognition failed.
        """
        # Check for existing data
        name = os.path.relpath(file_path, self.input_path)
        if name in self.speech_data:
            return (False, name, self.speech_data[name])
        # Convert to WAV
        file_path_wav = file_path + ".wav"
        if os.path.exists(file_path_wav):
            print("File already exists: %s" % file_path_wav)
        try:
            # -y overwrites a leftover WAV instead of stopping to ask, and
            # -nostdin keeps ffmpeg from reading the terminal; a batch run
            # must never block on an interactive prompt.
            ffmpeg_status = subprocess.call([
                "ffmpeg", "-nostdin", "-y", "-loglevel", "error",
                "-i", file_path, file_path_wav,
            ])
            if ffmpeg_status == 0:
                text = self._recognize_wav(file_path_wav)
            else:
                print("ffmpeg error: exit status {0}".format(ffmpeg_status))
                text = "[Error]"
        finally:
            # Clean up WAV file, even when interrupted; it may not exist
            # if the conversion itself failed.
            try:
                os.remove(file_path_wav)
            except FileNotFoundError:
                pass
        # All done
        self.speech_data[name] = text
        self.handled_count += 1
        return (True, name, text)

    def _recognize_wav(self, wav_path):
        """Return the Sphinx transcription of a WAV file.

        Returns an empty string when the recognizer raises
        ``UnknownValueError`` and ``"[Error]"`` on ``RequestError`` or
        ``ValueError``.
        """
        try:
            with speech_recognition.AudioFile(wav_path) as audio_source:
                audio = self.recognizer.record(audio_source)
            return self.recognizer.recognize_sphinx(audio)
        except speech_recognition.UnknownValueError:
            return ""
        except (speech_recognition.RequestError, ValueError) as error:
            print("Sphinx error: {0}".format(error))
            return "[Error]"
def __main__():
    """Command-line entry point: transcribe a directory and save the results.

    Expects exactly two arguments: the output file and the input directory.
    With any other argument count the usage text is printed to stderr and
    the process exits with status 2. Results are written after every 200
    newly transcribed files, once more at the end, and also when the run is
    aborted by an exception after at least one file was transcribed.
    """
    if len(sys.argv) != 3:
        print(
            "Usage: python elden-vttbatch.py [output-file] [directory-path]",
            file=sys.stderr,
        )
        # Wrong usage is an error; a non-zero status lets calling scripts
        # tell it apart from a successful run.
        sys.exit(2)
    output_path = sys.argv[1]
    input_path = sys.argv[2]
    vtt_batch = VTTBatch(output_path, input_path)
    try:
        for new_file, name, text in vtt_batch.iter_input_files():
            if not new_file:
                continue
            print(name, text)
            if vtt_batch.handled_count % 200 == 0:
                vtt_batch.write_speech_data()
    except BaseException:
        # BaseException on purpose: Ctrl-C (KeyboardInterrupt) must also
        # save the progress made so far before the error propagates.
        if vtt_batch.handled_count:
            vtt_batch.write_speech_data()
        raise
    vtt_batch.write_speech_data()


if __name__ == "__main__":
    __main__()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment