Skip to content

Instantly share code, notes, and snippets.

@larsvilhuber
Created October 17, 2025 15:35
Show Gist options
  • Select an option

  • Save larsvilhuber/2f3b1377d8dbc5d04932c71aab162a88 to your computer and use it in GitHub Desktop.

Select an option

Save larsvilhuber/2f3b1377d8dbc5d04932c71aab162a88 to your computer and use it in GitHub Desktop.
Create synthetic data with SDV
import pandas as pd
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import Metadata
import os
# Generate a synthetic copy of a Stata dataset with SDV's Gaussian copula model.
# Created based on the simple example at https://docs.sdv.dev/sdv/explore/sdv-community
#
# Inputs (all resolved relative to the directory containing this script):
#   Individual.dta            - the original data, in Stata format
#   Individual_metadata.json  - SDV metadata; written on the first run if absent
# Output:
#   Individual_synthetic.dta  - the synthetic data, in Stata format

# Read my Stata format data from the same directory as this code
current_dir = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(current_dir, 'Individual.dta')
metadata_path = os.path.join(current_dir, 'Individual_metadata.json')
data = pd.read_stata(data_path)

# For testing, create a smaller dataset. The sample size is capped at the
# number of available rows because sampling without replacement raises a
# ValueError when asked for more rows than the frame contains.
sample_size = min(1000, len(data))
data = data.sample(n=sample_size, random_state=42).reset_index(drop=True)

# print first few rows of the original data
print("======== Original Data ======== ")
print(data.head())

# Let SDV guess the column types; this is only a starting point.
metadata = Metadata.detect_from_dataframe(data)
# Print the detected metadata
print("======== Detected Metadata ======== ")
print(metadata.to_dict())

# If there is no metadata file, save the detected metadata to a JSON file.
# An existing file is deliberately left alone so manual edits are preserved.
if not os.path.exists(metadata_path):
    metadata.save_to_json(metadata_path, mode='overwrite')

# Updated metadata manually. Used https://faker.readthedocs.io/en/master/providers/faker.providers.company.html to define company (roughly)
# The metadata object is always reloaded from the file, so the (possibly
# hand-edited) JSON on disk is what the synthesizer actually uses.
metadata = Metadata.load_from_json(metadata_path)
# print the loaded metadata
print("======== Loaded Metadata ======== ")
print(metadata.to_dict())

# create and fit the synthesizer
print("Fitting the synthesizer...")
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data)

# draw synthetic data of the same size as the (sampled) original data
num_rows = data.shape[0]
synthetic_data = synthesizer.sample(num_rows=num_rows)

# save the synthetic data to a Stata file with the original file name plus '_synthetic'
synthetic_data_path = os.path.join(current_dir, 'Individual_synthetic.dta')
synthetic_data.to_stata(synthetic_data_path, write_index=False)
print(f"Synthetic data saved to {synthetic_data_path}")

# Optionally, print first few rows of the synthetic data
print("======== Synthetic Data ======== ")
print(synthetic_data.head())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment