Created
June 23, 2025 00:38
-
-
Save xarical/bf83744749458a60f4a21c596fdc7bb1 to your computer and use it in GitHub Desktop.
Using a Hugging Face dataset as a database
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Dependencies: datasets, huggingface_hub | |
| Environment variables: HF_API_KEY, DATASET_ID | |
| """ | |
| import os | |
| import json | |
| from datasets import load_dataset | |
| from huggingface_hub import HfApi | |
# Required configuration, read once at import time. Both lookups use
# subscript access, so a missing variable raises KeyError immediately.

# ID of the HF dataset that stores the data (DATASET_ID)
dataset_id: str = os.environ["DATASET_ID"]

# HF API key used to access the dataset (HF_API_KEY)
hf_api_key: str = os.environ["HF_API_KEY"]
def load_ds() -> list[dict]:
    """
    Load the HF dataset and return its "train" split as a list of row dicts.

    The split is converted with ``to_dict()`` into a column-oriented mapping,
    e.g. ``{"name": ["A", "B"], ..., "amount": [100, 200]}``, and is then
    reshaped into one dict per row, e.g.
    ``[{"name": "A", ..., "amount": 100}, {"name": "B", ..., "amount": 200}]``.
    Each row carries the keys "name", "category", "date" and "amount".

    Any exception raised while loading or reshaping is printed as a warning
    and an empty list is returned instead.
    """
    # Columns copied into every row, in this order
    fields = ("name", "category", "date", "amount")

    try:
        columns = load_dataset(dataset_id, token=hf_api_key)["train"].to_dict()

        # The "name" column decides how many rows are produced
        row_count = len(columns["name"])
        return [
            {field: columns[field][idx] for field in fields}
            for idx in range(row_count)
        ]
    except Exception as e:
        # Best-effort fallback: treat any failure as "no data yet"
        print("WARNING: dataset is empty or does not exist(?):", e)
        return []
def update_ds(user_data: list[dict]) -> None:
    """
    Write user_data to a local data.json file and upload it to the HF dataset.

    The list is serialized as-is (4-space indented JSON) into ``data.json``
    in the current working directory. That file is then uploaded to the
    dataset repo identified by ``dataset_id`` under the same name, and a
    confirmation message is printed.
    """
    json_name = "data.json"

    # Serialize the records to the local file that will be uploaded
    with open(json_name, 'w') as out:
        json.dump(user_data, out, indent=4)

    # Push the freshly written file to the dataset repo
    upload_args = {
        "path_or_fileobj": json_name,
        "path_in_repo": json_name,
        "repo_id": dataset_id,
        "repo_type": "dataset",
        "commit_message": "Update data.json 🤖",
        "token": hf_api_key,
    }
    HfApi().upload_file(**upload_args)

    print("Database updated!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment