Skip to content

Instantly share code, notes, and snippets.

@pineapplemachine
Last active March 20, 2022 21:29
Show Gist options
  • Select an option

  • Save pineapplemachine/77e64c92aa75757e235dd4f887e6a4ef to your computer and use it in GitHub Desktop.

Select an option

Save pineapplemachine/77e64c92aa75757e235dd4f887e6a4ef to your computer and use it in GitHub Desktop.
Batch speech-to-text Python script
"""
This script can be used to perform automated speech-to-text on every
*.ogg audio file in a directory, recursively.
This was written in order to help better document the items in the
`sounds` directory represented in Elden Ring's Data0.bhd archive file.
Here were the steps taken:
1. Extract files including sounds/pck/normal.pck from Data0.bhd in
Elden Ring's Game/ directory using ER.BDT.Tool:
https://github.com/Ekey/ER.BDT.Tool
2. Extract OGG audio files from sounds/pck/normal.pck using
this unpacker tool:
https://github.com/Vextil/Wwise-Unpacker
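3. Run this script using the directory containing the unpacked audio
files as input:
python elden-vttbatch.py [output-file] [directory-path]
The `ffmpeg` executable must be available on the PATH, and the
`speech_recognition` package must be installed with Sphinx support.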
"""
import os
import subprocess
import sys
import speech_recognition
class VTTBatch:
    """Batch speech-to-text over every ``*.ogg`` file below a directory.

    Results are kept in ``speech_data`` (relative file path -> recognized
    text) and persisted to a tab-separated text file, one ``name<TAB>text``
    record per line. Entries already present in that file are not
    transcribed again.
    """

    # Files smaller than this many bytes are skipped by iter_input_files.
    MIN_FILE_SIZE = 10000

    def __init__(self, output_path, input_path):
        """Create a batch and load any results already saved.

        Args:
            output_path: Path of the tab-separated results file.
            input_path: Root directory that is searched for ``*.ogg`` files.
        """
        self.output_path = output_path
        self.input_path = input_path
        self.speech_data = dict()
        self.load_speech_data()
        self.recognizer = speech_recognition.Recognizer()
        # Number of files transcribed by this instance (cached hits excluded).
        self.handled_count = 0

    def load_speech_data(self):
        """Populate ``speech_data`` from ``output_path`` if that file exists.

        Blank lines are ignored and lines without a tab separator are
        reported and skipped instead of aborting the whole load.
        """
        if not os.path.exists(self.output_path):
            return
        print("Reading existing data from", self.output_path)
        with open(self.output_path, "rt", encoding="utf-8") as data_file:
            for line in data_file:
                # Drop the line terminator so it neither defeats the
                # blank-line check nor ends up inside the stored text.
                line = line.rstrip("\r\n")
                if not line:
                    continue
                if "\t" not in line:
                    print("Skipping malformed line: %r" % line)
                    continue
                name, text = line.split("\t", 1)
                self.speech_data[name] = text

    def write_speech_data(self):
        """Write every entry of ``speech_data`` to ``output_path``.

        Tabs and newlines inside the text are replaced by spaces so each
        record stays on one line with a single separating tab.
        """
        print("Writing data to", self.output_path)
        with open(self.output_path, "wt", encoding="utf-8") as data_file:
            for name, text in self.speech_data.items():
                data_file.write("%s\t%s\n" % (
                    name, text.strip().replace("\t", " ").replace("\n", " ")
                ))

    def iter_input_files(self):
        """Walk ``input_path`` and process each sufficiently large ``*.ogg``.

        Yields:
            The ``(is_new, name, text)`` tuple returned by
            ``handle_input_file`` for each file that was not skipped.
        """
        for root, _dirs, files in os.walk(self.input_path):
            for file_name in files:
                if not file_name.endswith(".ogg"):
                    continue
                file_path = os.path.join(root, file_name)
                # Ignore small files
                if os.path.getsize(file_path) < self.MIN_FILE_SIZE:
                    continue
                yield self.handle_input_file(file_path)

    def handle_input_file(self, file_path):
        """Return the recognized text for one audio file.

        Args:
            file_path: Path of the ``*.ogg`` file to process.

        Returns:
            ``(is_new, name, text)`` where ``name`` is ``file_path`` relative
            to ``input_path``. ``is_new`` is False when the text came from
            previously loaded data and True when the file was transcribed by
            this call. ``text`` is ``""`` when nothing was recognized and
            ``"[Error]"`` when conversion or recognition failed.
        """
        # Check for existing data
        name = os.path.relpath(file_path, self.input_path)
        if name in self.speech_data:
            return (False, name, self.speech_data[name])
        file_path_wav = file_path + ".wav"
        if os.path.exists(file_path_wav):
            print("File already exists: %s" % file_path_wav)
        try:
            text = self._transcribe(file_path, file_path_wav)
        finally:
            # Clean up the WAV file even when interrupted, and tolerate the
            # case where ffmpeg never produced it.
            if os.path.exists(file_path_wav):
                os.remove(file_path_wav)
        # All done
        self.speech_data[name] = text
        self.handled_count += 1
        return (True, name, text)

    def _transcribe(self, file_path, file_path_wav):
        """Convert ``file_path`` to WAV at ``file_path_wav`` and run Sphinx."""
        # "-y" overwrites a stale WAV (e.g. left by an interrupted run)
        # instead of blocking on ffmpeg's interactive overwrite prompt.
        ffmpeg_result = subprocess.run([
            "ffmpeg", "-y", "-loglevel", "error", "-i", file_path, file_path_wav,
        ])
        if ffmpeg_result.returncode != 0:
            print("ffmpeg error: exit code {0}".format(ffmpeg_result.returncode))
            return "[Error]"
        # Speech to text
        try:
            with speech_recognition.AudioFile(file_path_wav) as audio_source:
                audio = self.recognizer.record(audio_source)
            return self.recognizer.recognize_sphinx(audio)
        except speech_recognition.UnknownValueError:
            return ""
        except (speech_recognition.RequestError, ValueError) as error:
            print("Sphinx error: {0}".format(error))
            return "[Error]"
def __main__():
    """Command-line entry point: transcribe a directory of OGG files.

    Expects exactly two arguments in ``sys.argv``: the output file and the
    input directory. Results are written after every 200 newly transcribed
    files and once more at the end. If the run is aborted by any exception
    (including Ctrl-C), the results gathered so far are written before the
    exception is re-raised.
    """
    if len(sys.argv) != 3:
        print(
            "Usage: python elden-vttbatch.py [output-file] [directory-path]",
            file=sys.stderr,
        )
        # Wrong usage is a failure, so report a non-zero exit status.
        sys.exit(1)
    output_path, input_path = sys.argv[1:3]
    vtt_batch = VTTBatch(output_path, input_path)
    try:
        for new_file, name, text in vtt_batch.iter_input_files():
            if not new_file:
                continue
            print(name, text)
            # Periodic checkpoint so a crash loses at most 200 results.
            if vtt_batch.handled_count % 200 == 0:
                vtt_batch.write_speech_data()
    except BaseException:
        # BaseException on purpose: KeyboardInterrupt must also trigger a
        # save. The exception is always re-raised afterwards.
        if vtt_batch.handled_count:
            vtt_batch.write_speech_data()
        raise
    vtt_batch.write_speech_data()


if __name__ == "__main__":
    __main__()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment