Skip to content

Instantly share code, notes, and snippets.

@saifulbkhan
Created October 11, 2024 13:49
Show Gist options
  • Select an option

  • Save saifulbkhan/2d49d1105554f86093398f0a6275b257 to your computer and use it in GitHub Desktop.

Select an option

Save saifulbkhan/2d49d1105554f86093398f0a6275b257 to your computer and use it in GitHub Desktop.
A script to generate dummy data into a MongoDB cluster
# Needs:
# mimesis==18.0.0
# tqdm==4.66.5
# pymongo (imported below; no version is pinned in this script)
import random
import time
from mimesis import Field, Fieldset, Schema
from mimesis.enums import Gender, TimestampFormat
from mimesis.locales import Locale
from pymongo import MongoClient
from tqdm import tqdm
# Connection endpoints for the two MongoDB servers that seed() writes to.
db1_host = "localhost"
db1_port = 27017
db2_host = "localhost"
db2_port = 27018
# Shared mimesis providers used by generate_dummy_batch(); both are created
# with the English locale and the same fixed seed (0xff).
# NOTE(review): the fixed seed presumably makes seeded field values repeat
# from one script run to the next -- confirm against the mimesis docs.
field = Field(Locale.EN, seed=0xff)
fieldset = Fieldset(Locale.EN, seed=0xff)
def generate_dummy_batch(num_docs):
    """Generate ``num_docs`` fake "asset" documents with mimesis.

    Every document carries the keys ``uid``, ``name``, ``version``,
    ``timestamp``, ``owner`` (a nested dict with ``email`` and ``creator``)
    and ``apiKeys``.

    Args:
        num_docs: Number of documents to generate; passed to ``Schema`` as
            its ``iterations`` argument.

    Returns:
        Whatever ``Schema.create()`` returns; callers in this file slice and
        index it as a sequence of documents.
    """

    def build_asset():
        # Populate the keys one by one, in the same order as the document
        # layout: the module-level providers are seeded, so the order of the
        # calls determines which values each key receives.
        asset = {}
        asset["uid"] = field("uuid")
        asset["name"] = field("text.word")
        asset["version"] = field("version")
        asset["timestamp"] = field("timestamp", fmt=TimestampFormat.POSIX)
        asset["owner"] = {
            "email": field("person.email", domains=["mimesis.name"]),
            "creator": field("full_name", gender=Gender.FEMALE),
        }
        # token_hex values requested with i=3; the key callback keeps only
        # the first 16 characters of each one.
        asset["apiKeys"] = fieldset("token_hex", key=lambda s: s[:16], i=3)
        return asset

    batch_schema = Schema(schema=build_asset, iterations=num_docs)
    return batch_schema.create()
def seed():
    """Run one seeding pass that writes dummy documents to both servers.

    Generates ``num_batches`` batches of ``num_docs_per_batch`` documents and
    inserts each batch into the ``data`` collection of the ``dummy`` database
    on both MongoDB servers. With probability ``miss_rate`` a batch is
    written in full to one server (picked 50/50) while the other server
    receives every document except the last one. The withheld documents are
    collected and inserted into the server that missed them after a delay, at
    the end of the pass.

    Both clients are opened as context managers so their connections are
    released even when an insert raises part-way through the pass.

    NOTE(review): the statement nesting was reconstructed from a copy of the
    script without indentation; the one-second pause is assumed to run once
    per batch -- confirm against the original.
    """
    miss_rate = 0.1
    num_batches = 100
    num_docs_per_batch = 200
    # Documents deliberately withheld from each server, keyed by server.
    missed_docs = {"db1": [], "db2": []}

    with MongoClient(db1_host, db1_port) as mongo1, \
            MongoClient(db2_host, db2_port) as mongo2:
        db1 = mongo1.dummy
        db2 = mongo2.dummy

        for _ in tqdm(range(num_batches)):
            batch = generate_dummy_batch(num_docs_per_batch)
            if random.random() < miss_rate:
                # Simulate a missed write: one server gets the full batch,
                # the other is short of the batch's last document.
                if random.random() < 0.5:
                    db1.data.insert_many(batch)
                    db2.data.insert_many(batch[:-1])
                    missed_docs["db2"].append(batch[-1])
                else:
                    db1.data.insert_many(batch[:-1])
                    db2.data.insert_many(batch)
                    missed_docs["db1"].append(batch[-1])
            else:
                db1.data.insert_many(batch)
                db2.data.insert_many(batch)
            # Pace the writes: pause between batches.
            time.sleep(1.0)

        # Here we reinsert the documents missed in the seeding loop above,
        # after a further delay so the two servers stay out of step for a
        # while.
        time.sleep(10.0)
        # insert_many is only called for non-empty lists; a pass may have
        # withheld nothing from a given server.
        if missed_docs["db1"]:
            db1.data.insert_many(missed_docs["db1"])
        if missed_docs["db2"]:
            db2.data.insert_many(missed_docs["db2"])
if __name__ == "__main__":
    # Script entry point: run several complete seeding passes back to back,
    # announcing each one with its 1-based round number.
    num_rounds = 3
    for round_number in range(1, num_rounds + 1):
        print(f"Round {round_number}, seeding data...")
        seed()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment