Last active
November 18, 2025 19:23
-
-
Save phucdev/a157ed9a36599cf750936af1868ccaa4 to your computer and use it in GitHub Desktop.
Convert ClassLabel/Sequence[ClassLabel] to string labels for HuggingFace Datasets
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from datasets import Dataset, ClassLabel, Value, load_dataset | |
def _label_to_str(tag, names):
    """
    Map a single ClassLabel index to its string name.

    :param tag: integer class index, or None / -1 for a missing label
    :param names: list of class names, indexed by class id
    :return: the class name, or None when the label is missing
    :raises IndexError: if the index is outside ``0 <= tag < len(names)``
    """
    # -1 is the "no label" value ClassLabel accepts; plain list indexing would
    # silently turn it into the *last* class name.
    if tag is None or tag == -1:
        return None
    if not 0 <= tag < len(names):
        raise IndexError(f"class label index {tag} out of range for {len(names)} names")
    return names[tag]


def convert_class_labels_to_str(examples: Dataset) -> Dataset:
    """
    Utility function to turn (shallow) ClassLabel indices into string labels.

    This is common for datasets hosted on the Hugging Face datasets hub with data loading scripts, where the label(s)
    are stored as ClassLabel or Sequence[ClassLabel] objects, while the string labels are often what is needed.
    Only top-level ClassLabel columns and top-level list-like columns whose element feature is a ClassLabel are
    converted; a ClassLabel embedded in a nested structure like a dict is left untouched.
    Missing labels (None or -1) are converted to None instead of being resolved to a class name.

    :param examples: HuggingFace Dataset
    :return: new Dataset, rebuilt from the converted rows, with string labels
    :raises IndexError: if a label index does not fit the feature's list of names
    """
    # feature name -> (class names, True if the column holds a list of labels)
    feature_mappings = {}
    for feature_name, feature in examples.features.items():
        if isinstance(feature, ClassLabel):
            feature_mappings[feature_name] = (feature.names, False)
            continue
        if isinstance(feature, list):
            # Plain Python list used as a "list of <feature>" declaration.
            inner = feature[0] if len(feature) == 1 else None
        else:
            # List-like containers expose their element feature as `.feature`.
            inner = getattr(feature, "feature", None)
        if isinstance(inner, ClassLabel):
            feature_mappings[feature_name] = (inner.names, True)

    converted_examples = []
    for example in examples:
        for feature_name, (str_labels, is_sequence) in feature_mappings.items():
            if feature_name not in example:
                continue
            # pop + re-insert (rather than assigning in place) keeps the key order
            # of the produced rows the same as before: converted columns go last.
            tags = example.pop(feature_name)
            if tags is None:
                str_tags = None
            elif is_sequence:
                str_tags = [_label_to_str(tag, str_labels) for tag in tags]
            else:
                # Decided from the feature type, not isinstance(tags, int), so
                # non-`int` integer scalars are still treated as a single label.
                str_tags = _label_to_str(tags, str_labels)
            example[feature_name] = str_tags
        converted_examples.append(example)
    return Dataset.from_list(converted_examples)
def main():
    # Demo run: fetch the conll2003 training split and convert its ClassLabel
    # columns to string labels.
    train_split = load_dataset(
        "conll2003",
        split="train",
        trust_remote_code=True,
    )
    string_labeled = convert_class_labels_to_str(train_split)
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment