Skip to content

Instantly share code, notes, and snippets.

@xarical
Created June 23, 2025 00:38
Show Gist options
  • Select an option

  • Save xarical/bf83744749458a60f4a21c596fdc7bb1 to your computer and use it in GitHub Desktop.

Select an option

Save xarical/bf83744749458a60f4a21c596fdc7bb1 to your computer and use it in GitHub Desktop.
Using a Hugging Face dataset as a database
"""
Dependencies: datasets, huggingface_hub
Environment variables: HF_API_KEY, DATASET_ID
"""
import os
import json
from datasets import load_dataset
from huggingface_hub import HfApi
# ID of the HF dataset that acts as the database, read from the DATASET_ID
# environment variable. A missing variable raises KeyError at import time.
dataset_id: str = os.environ["DATASET_ID"]
# HF API key passed as `token` when loading from and uploading to the dataset,
# read from the HF_API_KEY environment variable (KeyError if it is unset).
hf_api_key: str = os.environ["HF_API_KEY"]
def load_ds() -> list[dict]:
    """
    Fetch the "train" split of the HF dataset and return it as a list of rows.

    The split is loaded with ``load_dataset(dataset_id, token=hf_api_key)`` and
    converted with ``to_dict()``, which yields one list per column. That
    column-oriented mapping is pivoted into one dict per row holding the
    "name", "category", "date" and "amount" values.

    Returns:
        One dict per row. If loading or converting raises any exception
        (for example because the dataset is empty or does not exist), a
        warning is printed and an empty list is returned instead.
    """
    fields = ("name", "category", "date", "amount")
    try:
        columns = load_dataset(dataset_id, token=hf_api_key)["train"].to_dict()
        # Pivot columns into rows:
        #   {"name": ["A", "B"], ..., "amount": [100, 200]}
        # becomes
        #   [{"name": "A", ..., "amount": 100}, {"name": "B", ..., "amount": 200}]
        # The "name" column determines how many rows are produced.
        return [
            {field: columns[field][row] for field in fields}
            for row in range(len(columns["name"]))
        ]
    except Exception as err:
        # Best-effort: any failure is reported and treated as "no data".
        print("WARNING: dataset is empty or does not exist(?):", err)
        return []
def update_ds(user_data: list[dict]) -> None:
    """
    Write user_data to the HF dataset.

    The data is serialized as JSON (4-space indent) into a local ``data.json``
    file in the current working directory. That file is then uploaded to the
    dataset repo ``dataset_id`` as ``data.json``, authenticating with
    ``hf_api_key``. A confirmation message is printed once the upload returns.

    Args:
        user_data: The rows to store; must be JSON-serializable.
    """
    local_path = 'data.json'

    # Serialize the rows to the local file that will be uploaded.
    with open(local_path, 'w') as out:
        json.dump(user_data, out, indent=4)

    # Push the local file to the dataset repo under the same file name.
    upload_args = {
        "path_or_fileobj": local_path,
        "path_in_repo": "data.json",
        "repo_id": dataset_id,
        "repo_type": "dataset",
        "commit_message": "Update data.json 🤖",
        "token": hf_api_key,
    }
    HfApi().upload_file(**upload_args)

    print("Database updated!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment