Skip to content

Instantly share code, notes, and snippets.

@WandererGuy
Last active April 27, 2025 07:14
Show Gist options
  • Select an option

  • Save WandererGuy/f0b067df356dbe2dc3e6efbddd1562b3 to your computer and use it in GitHub Desktop.

Select an option

Save WandererGuy/f0b067df356dbe2dc3e6efbddd1562b3 to your computer and use it in GitHub Desktop.
"""
Given two language folders of the FLEURS dataset (for example en and vi), downloaded and extracted from:
https://huggingface.co/datasets/google/fleurs/tree/main/data
__en
____train
____dev
____test
__vi
____train
____dev
____test
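Each language folder must also contain train.tsv, dev.tsv and test.tsv; the wav files are read from <language>/<split>/<split>/.
This script builds ./SRC_AUDIO and ./TGT_AUDIO for training speech-to-discrete-unit speech translation.
You only need to change: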
- language_ls
- mapping
"""
import os
import shutil
# Language codes to process; each one is used as an input folder name and as a key of `mapping`.
language_ls = ["en", "vi"]
# Output folder for each language code.
mapping = {
    "en": "./SRC_AUDIO",
    "vi": "./TGT_AUDIO",
}
# Create the output folders up front; only the paths are needed here, not the language keys.
for output_folder in mapping.values():
    os.makedirs(output_folder, exist_ok=True)
"""
The wav files in the SRC and TGT folders must share the same name (the wav id), so:
- rename each wav file to its wav id
- create processed train, dev and test folders inside each {language} folder
"""
# For every language and split, copy each wav listed in the split's TSV into
# {language}/{split}_processed/, named after its wav id instead of its file name.
for language in language_ls:
    for dataset_type in ["train", "dev", "test"]:
        # Audio is read from {language}/{split}/{split}/ and metadata from {language}/{split}.tsv.
        split_folder = os.path.join(language, dataset_type, dataset_type)
        split_tsv = os.path.join(language, f"{dataset_type}.tsv")

        # Maps wav file name (second TSV column) -> wav id (first TSV column).
        wav_name_to_id = {}
        # Read as UTF-8 explicitly instead of the platform default encoding.
        # NOTE(review): the unused columns presumably hold non-ASCII transcripts — confirm.
        with open(split_tsv, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                fields = line.split("\t")
                wav_name_to_id[fields[1]] = fields[0]

        processed_folder = os.path.join(language, f"{dataset_type}_processed")
        os.makedirs(processed_folder, exist_ok=True)
        for wav_name, wav_id in wav_name_to_id.items():
            wav_filepath = os.path.join(split_folder, wav_name)
            dest_filepath = os.path.join(processed_folder, wav_id + ".wav")
            # Several wav files may map to the same id; only the first one copied is kept,
            # and files already copied by a previous run are not copied again.
            if not os.path.exists(dest_filepath):
                shutil.copy(wav_filepath, dest_filepath)
"""
Move the processed train, dev and test folders into SRC_AUDIO / TGT_AUDIO and rename them to train, dev and test.
"""
# Move each {language}/{split}_processed folder to {output folder}/{split}.
for language in language_ls:
    dest_folder = mapping[language]
    for dataset_type in ["train", "dev", "test"]:
        src = os.path.join(language, f"{dataset_type}_processed")
        final_folder = os.path.join(dest_folder, dataset_type)
        # Fail before touching anything if the target already exists (e.g. on a re-run),
        # so no stray "{split}_processed" folder is left behind in the output folder.
        if os.path.exists(final_folder):
            raise FileExistsError(
                f"{final_folder} already exists; remove it before running this script again"
            )
        # Moving straight to the final path relocates and renames the folder in one step.
        shutil.move(src, final_folder)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment