Created
July 4, 2025 11:41
-
-
Save lewtun/35b2231c485f1a2784898b084db48724 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # pip install emoji | |
| import argparse | |
| from datasets import load_dataset | |
| import emoji | |
def remove_emoji(text: str) -> str:
    """Return *text* with every emoji deleted and outer whitespace trimmed.

    Emoji are replaced by the empty string through ``emoji.replace_emoji``.
    The result is then stripped, so leading and trailing whitespace is
    removed even when *text* contains no emoji at all.
    """
    without_emoji = emoji.replace_emoji(text, replace="")
    return without_emoji.strip()
def format_messages(x):
    """Strip emojis from the assistant messages of a single dataset row.

    Args:
        x: A row whose ``"messages"`` entry is a list of dicts, each with a
            ``"role"`` and a ``"content"`` key.

    Returns:
        The same row, updated in place. Assistant messages that contained
        emojis have their content replaced by the cleaned text, and a
        boolean ``"_emojis_found"`` entry records whether any assistant
        message in the row was changed.
    """
    emojis_found = False
    for msg in x["messages"]:
        if msg["role"] != "assistant":
            continue
        original_content = msg["content"]
        # Some assistant turns may carry no text content; there is nothing
        # to clean, and passing a non-string to the emoji library would fail.
        if not isinstance(original_content, str):
            continue
        cleaned_content = remove_emoji(original_content)
        # remove_emoji() also strips surrounding whitespace, so compare
        # against the stripped original. Otherwise a message that merely
        # has leading/trailing whitespace is reported as containing emojis.
        if cleaned_content != original_content.strip():
            emojis_found = True
            msg["content"] = cleaned_content
    x["_emojis_found"] = emojis_found
    return x
def main():
    """Remove emojis from a dataset's assistant messages and publish the result.

    Loads the ``train`` split of the dataset/config given on the command
    line, cleans every row with ``format_messages``, and, if at least one
    row was changed, pushes the cleaned split back to the same dataset
    repository under the config name ``<config>_no_emoji``. Nothing is
    pushed when no emojis were found.
    """
    parser = argparse.ArgumentParser(description="Remove emojis from dataset assistant messages")
    # Without a dataset ID there is nothing to load; fail at argument
    # parsing with a clear usage error instead of inside load_dataset.
    parser.add_argument("--dataset_id", required=True, help="Dataset ID to process")
    parser.add_argument("--config", default="default", help="Config name (default: default)")
    parser.add_argument(
        "--num_proc",
        type=int,
        default=32,
        help="Number of worker processes used for processing (default: 32)",
    )
    args = parser.parse_args()

    print(f"Loading dataset: {args.dataset_id}, config: {args.config}")
    ds = load_dataset(args.dataset_id, args.config, split="train")

    # Process the dataset
    ds = ds.map(format_messages, num_proc=args.num_proc)

    # Read the flag column once. Each row is flagged at most once, so this
    # is the number of rows in which at least one message was changed.
    num_flagged = sum(ds["_emojis_found"])

    # The flag column is only bookkeeping; drop it before any upload.
    ds = ds.remove_columns(["_emojis_found"])

    if not num_flagged:
        print("No emojis found in assistant messages")
        return

    print(f"Emojis found and removed from assistant messages. Removed emojis from {num_flagged} messages.")

    # Publish under a new config name so the original config is left untouched.
    new_config = f"{args.config}_no_emoji"
    print(f"Pushing to hub with new config: {new_config}")
    ds.push_to_hub(args.dataset_id, config_name=new_config)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment