Skip to content

Instantly share code, notes, and snippets.

@phucdev
Last active November 18, 2025 19:23
Show Gist options
  • Select an option

  • Save phucdev/a157ed9a36599cf750936af1868ccaa4 to your computer and use it in GitHub Desktop.

Select an option

Save phucdev/a157ed9a36599cf750936af1868ccaa4 to your computer and use it in GitHub Desktop.
Convert ClassLabel/Sequence[ClassLabel] to string labels for HuggingFace Datasets
from datasets import Dataset, ClassLabel, Value, load_dataset
def _class_label_to_str(tag, str_labels):
    """Map one ClassLabel index to its name; ``None`` or a negative index gives ``None``."""
    # Negative indices (conventionally -1) mark unlabeled examples. Plain list
    # indexing would silently wrap around and return a label from the end.
    if tag is None or tag < 0:
        return None
    return str_labels[tag]


def convert_class_labels_to_str(examples: Dataset) -> Dataset:
    """
    Turn (shallow) ClassLabel indices into their string labels.

    Datasets on the Hugging Face hub often store labels as ClassLabel or
    Sequence[ClassLabel] features, i.e. as integer indices, while the string
    labels are usually what we want to work with.

    Only top-level columns are handled: a ClassLabel column, or a container
    column whose ``.feature`` is a ClassLabel. ClassLabel features nested deeper
    (e.g. inside a dict) are left untouched.

    Missing labels (``None``) and negative indices (e.g. ``-1`` for unlabeled
    examples) are converted to ``None`` instead of raising or wrapping around to
    an unrelated label. An index beyond the number of names still raises
    ``IndexError``.

    The result is rebuilt from plain Python dicts with ``Dataset.from_list``
    without passing the original features; converted columns are re-inserted
    after the remaining columns of each example.

    :param examples: HuggingFace Dataset
    :return: Dataset with string labels
    """
    # Column name -> list of label names, for every column that needs converting.
    feature_mappings = {}
    for feature_name, feature in examples.features.items():
        if isinstance(feature, ClassLabel):
            feature_mappings[feature_name] = feature.names
            continue
        # Container features expose their element type as `.feature`; features
        # without that attribute (or with a non-ClassLabel element) are skipped.
        inner_feature = getattr(feature, "feature", None)
        if isinstance(inner_feature, ClassLabel):
            feature_mappings[feature_name] = inner_feature.names

    converted_examples = []
    for example in examples:
        for feature_name, str_labels in feature_mappings.items():
            if feature_name not in example:
                continue
            tags = example.pop(feature_name)
            if tags is None or isinstance(tags, int):
                # Single label (or a missing value for the whole column entry).
                str_tags = _class_label_to_str(tags, str_labels)
            else:
                # Sequence of labels: convert element-wise.
                str_tags = [_class_label_to_str(tag, str_labels) for tag in tags]
            example[feature_name] = str_tags
        converted_examples.append(example)
    return Dataset.from_list(converted_examples)
def main():
    """Usage example: convert the labels of the conll2003 training split."""
    train_split = load_dataset("conll2003", split="train", trust_remote_code=True)
    # The converted dataset is only bound to a local name; nothing further is done with it.
    converted_train_split = convert_class_labels_to_str(train_split)
# Run the example only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment