Skip to content

Instantly share code, notes, and snippets.

@lewtun
Created July 4, 2025 11:41
Show Gist options
  • Select an option

  • Save lewtun/35b2231c485f1a2784898b084db48724 to your computer and use it in GitHub Desktop.

Select an option

Save lewtun/35b2231c485f1a2784898b084db48724 to your computer and use it in GitHub Desktop.
# pip install emoji
import argparse
from datasets import load_dataset
import emoji
def remove_emoji(text: str) -> str:
    """Return *text* with all emoji removed and outer whitespace trimmed.

    Every emoji recognised by ``emoji.replace_emoji`` is replaced with the
    empty string. The result is then stripped, so whitespace left at either
    end by a removed emoji does not linger. Note that the strip is applied
    unconditionally: text that contains no emoji is still trimmed.
    """
    return emoji.replace_emoji(text, replace="").strip()
def format_messages(x):
    """Strip emoji from the assistant turns of one dataset row.

    Walks ``x["messages"]`` and, for each message whose ``"role"`` is
    ``"assistant"``, removes emoji from its ``"content"`` in place. The row
    is also given a boolean ``"_emojis_found"`` key that is True when at
    least one assistant message actually contained an emoji.

    Args:
        x: A mapping with a ``"messages"`` list; each message is a mapping
            with ``"role"`` and ``"content"`` keys.

    Returns:
        The same row object, mutated, with ``"_emojis_found"`` set.
    """
    emojis_found = False
    for msg in x["messages"]:
        if msg["role"] != "assistant":
            continue

        original_content = msg["content"]
        # Content may be missing/None (or otherwise non-text); there is
        # nothing to clean and the emoji library would raise on it.
        if not isinstance(original_content, str):
            continue

        without_emoji = emoji.replace_emoji(original_content, replace="")
        # Compare BEFORE stripping: otherwise a message that merely has
        # leading/trailing whitespace would be reported (and rewritten) as
        # if it had contained emoji.
        if without_emoji == original_content:
            continue

        emojis_found = True
        # Trim whitespace left dangling at the ends by the removed emoji
        # (same clean-up remove_emoji applies).
        msg["content"] = without_emoji.strip()

    x["_emojis_found"] = emojis_found
    return x
def main():
    """Command-line entry point: clean a Hub dataset and push the result.

    Loads the ``train`` split of ``--dataset_id`` / ``--config``, runs
    ``format_messages`` over every row, and, only if at least one row was
    changed, pushes the cleaned dataset back to the same repository under a
    new config named ``<config>_no_emoji``. Nothing is pushed when no emoji
    were found.
    """
    parser = argparse.ArgumentParser(description="Remove emojis from dataset assistant messages")
    # Required: without it load_dataset would be called with None and fail
    # with a far less helpful error.
    parser.add_argument("--dataset_id", required=True, help="Dataset ID to process")
    parser.add_argument("--config", default="default", help="Config name (default: default)")
    parser.add_argument(
        "--num_proc",
        type=int,
        default=32,
        help="Number of worker processes used to process the dataset (default: 32)",
    )
    args = parser.parse_args()

    print(f"Loading dataset: {args.dataset_id}, config: {args.config}")
    ds = load_dataset(args.dataset_id, args.config, split="train")

    # Process the dataset
    ds = ds.map(format_messages, num_proc=args.num_proc)

    # Read the flag column once; each row contributes at most 1, so this is
    # the number of rows (not individual messages) that were modified.
    num_flagged = sum(ds["_emojis_found"])

    # The flag column is bookkeeping only and must not reach the Hub.
    ds = ds.remove_columns(["_emojis_found"])

    if not num_flagged:
        print("No emojis found in assistant messages")
        return

    print(f"Emojis found and removed from assistant messages. Removed emojis from {num_flagged} examples.")

    # Push under a new config so the original data stays untouched.
    new_config = f"{args.config}_no_emoji"
    print(f"Pushing to hub with new config: {new_config}")
    ds.push_to_hub(args.dataset_id, config_name=new_config)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment