Last active
April 27, 2025 07:14
-
-
Save WandererGuy/f0b067df356dbe2dc3e6efbddd1562b3 to your computer and use it in GitHub Desktop.
answer youtube comment https://www.youtube.com/watch?v=HIAt9kawqsQ&lc=UgyeTOQyQvOZGaEAa6N4AaABAg.A7sazpD7tDCAHPaVgmyN4q
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Build ./SRC_AUDIO and ./TGT_AUDIO from two extracted FLEURS language folders.

Given two language folders of the FLEURS dataset (example: en and vi,
downloaded and extracted from
https://huggingface.co/datasets/google/fleurs/tree/main/data):

__en
____train
____dev
____test
__vi
____train
____dev
____test

this script reads, for every language and split:

- <language>/<split>.tsv             (column 0 = wav id, column 1 = wav file name)
- <language>/<split>/<split>/<wav file name>

and writes <mapping[language]>/<split>/<wav id>.wav, so that the source and
target folders contain wav files with the same name (the wav id). These
folders are meant for training speech-to-discrete-unit speech translation.

You only need to change:
- language_ls
- mapping
"""
import os
import shutil
from typing import Dict

# Languages to process; each entry is the path of an extracted FLEURS folder.
language_ls = ["en", "vi"]

# Output root for each language.
mapping = {
    "en": "./SRC_AUDIO",
    "vi": "./TGT_AUDIO",
}

# Dataset splits expected inside every language folder.
DATASET_TYPES = ("train", "dev", "test")


def read_wav_ids(tsv_path: str) -> Dict[str, str]:
    """Parse a split .tsv file into a {wav file name: wav id} mapping.

    Blank lines are skipped. Only the first two tab-separated columns are
    used (column 0 is the wav id, column 1 is the wav file name).

    Args:
        tsv_path: Path of the .tsv file to read.

    Returns:
        Mapping from wav file name to wav id, in file order.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    wav_ids: Dict[str, str] = {}
    # Explicit UTF-8: the file holds transcriptions in the dataset language,
    # and the platform default encoding is not guaranteed to decode them.
    with open(tsv_path, "r", encoding="utf-8") as tsv_file:
        for line_no, raw_line in enumerate(tsv_file, start=1):
            line = raw_line.strip()
            if not line:
                continue
            fields = line.split("\t")
            if len(fields) < 2:
                raise ValueError(
                    f"{tsv_path}:{line_no}: expected at least 2 "
                    f"tab-separated columns, got {len(fields)}"
                )
            wav_ids[fields[1]] = fields[0]
    return wav_ids


def export_split(language: str, dataset_type: str, dest_root: str) -> int:
    """Copy one split of one language into dest_root, renamed to the wav id.

    Each wav listed in <language>/<dataset_type>.tsv is copied from
    <language>/<dataset_type>/<dataset_type>/ to
    <dest_root>/<dataset_type>/<wav id>.wav. A destination file that already
    exists is left untouched, so when several wav files share one id only the
    first one listed is kept, and re-running the script is harmless.

    Args:
        language: Path of the extracted language folder.
        dataset_type: Split name ("train", "dev" or "test").
        dest_root: Output root folder for this language.

    Returns:
        Number of files copied by this call.
    """
    audio_folder = os.path.join(language, dataset_type, dataset_type)
    tsv_path = os.path.join(language, f"{dataset_type}.tsv")
    dest_folder = os.path.join(dest_root, dataset_type)
    os.makedirs(dest_folder, exist_ok=True)

    copied = 0
    for wav_name, wav_id in read_wav_ids(tsv_path).items():
        dest_filepath = os.path.join(dest_folder, wav_id + ".wav")
        if os.path.exists(dest_filepath):
            continue
        shutil.copy(os.path.join(audio_folder, wav_name), dest_filepath)
        copied += 1
    return copied


def main() -> None:
    """Create every output root, then export all splits of all languages."""
    for dest_root in mapping.values():
        os.makedirs(dest_root, exist_ok=True)
    for language in language_ls:
        for dataset_type in DATASET_TYPES:
            export_split(language, dataset_type, mapping[language])


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment