Skip to content

Instantly share code, notes, and snippets.

@gabrielqmatos88
Created July 2, 2025 10:55
Show Gist options
  • Select an option

  • Save gabrielqmatos88/d7c2bf74ae53a487bbab3fdb749a1eca to your computer and use it in GitHub Desktop.

Select an option

Save gabrielqmatos88/d7c2bf74ae53a487bbab3fdb749a1eca to your computer and use it in GitHub Desktop.
Python script that generates a sample of a dataset while keeping the proportion of question levels
import numpy as np
import math
from typing import List, Dict
def get_questions_by_level(questions: List[Dict], level: str) -> List[Dict]:
    """Return the questions whose ``"level"`` entry equals *level*.

    Args:
        questions: Question dicts; each one must have a ``"level"`` key.
        level: Difficulty label to match (compared with ``==``).

    Returns:
        A new list holding the matching questions in their original order.

    Raises:
        KeyError: If a question has no ``"level"`` key.
    """
    return [question for question in questions if question["level"] == level]
# Step 1: Generate 302 dummy questions (189 easy, 84 medium, 29 hard) with labeled difficulty
def generate_questions() -> List[Dict]:
    """Build the dummy question set used by the example.

    The set holds 189 easy, 84 medium and 29 hard questions (302 in total),
    grouped by level in that order.

    Returns:
        One dict per question with the keys ``"question"`` (display text),
        ``"level"`` (difficulty label) and ``"index"`` (0-based position in
        the returned list).
    """
    levels = ['easy', 'medium', 'hard']
    # Number of questions per level, in the same order as `levels`.
    distribution = [189, 84, 29]

    q_levels = []
    for level, count in zip(levels, distribution):
        q_levels.extend([level] * count)

    return [
        {"question": f"Question {i} - {level}", "level": level, "index": i}
        for i, level in enumerate(q_levels)
    ]
# Step 2: Sample question indexes by level and given percentage
def sample_question_indexes(questions: List[Dict], percentage: float) -> List[int]:
    """Draw a random sample of question indexes, stratified by level.

    For each of the levels ``'easy'``, ``'medium'`` and ``'hard'``,
    ``ceil(count * percentage / 100)`` indexes are drawn without replacement
    via ``np.random.choice``. Questions with any other level are never
    sampled.

    Args:
        questions: Question dicts with ``"level"`` and ``"index"`` keys.
        percentage: Share of each level to sample, from 0 to 100.

    Returns:
        The sampled ``"index"`` values as plain Python values, sorted
        ascending.

    Raises:
        ValueError: If *percentage* is outside the range 0..100.
        KeyError: If a question lacks ``"level"``, or a question of a sampled
            level lacks ``"index"``.
    """
    if not 0 <= percentage <= 100:
        raise ValueError(f"percentage must be between 0 and 100, got {percentage}")

    levels = ['easy', 'medium', 'hard']
    sampled_indexes = []
    for level in levels:
        indexes = [q["index"] for q in questions if q["level"] == level]
        # Multiply before dividing: len * (percentage / 100.0) can overshoot an
        # exact result (100 * (7 / 100.0) == 7.000000000000001), which ceil
        # would then round up to one sample too many.
        sample_size = math.ceil(len(indexes) * percentage / 100)
        if sample_size == 0:
            continue  # nothing to draw for this level
        sampled = np.random.choice(indexes, size=sample_size, replace=False)
        # tolist() converts NumPy scalars into plain Python values.
        sampled_indexes.extend(sampled.tolist())
    return sorted(sampled_indexes)  # preserve ascending index order
# Step 3: Retrieve questions by sampled indexes
def get_questions_by_indexes(questions: List[Dict], indexes: List[int]) -> List[Dict]:
    """Return the questions whose ``"index"`` entry appears in *indexes*.

    Args:
        questions: Question dicts; each one must have an ``"index"`` key.
        indexes: Index values to keep. Duplicates and values that match no
            question are ignored.

    Returns:
        A new list with the selected questions, in the order they appear in
        *questions* (not in the order of *indexes*).

    Raises:
        KeyError: If a question has no ``"index"`` key.
    """
    wanted = set(indexes)  # constant-time membership test per question
    return [question for question in questions if question["index"] in wanted]
# Step 4: Log the count of questions and the expected proportion
def log_question_stats(questions: List[Dict], percentage: float, level: str) -> tuple:
    """Print and return the question count and expected sample size for a level.

    Args:
        questions: Question dicts; each one must have a ``"level"`` key.
        percentage: Share of the level to sample, expressed in percent.
        level: Difficulty label to report on.

    Returns:
        A ``(count, sample_size)`` tuple, where ``count`` is the number of
        questions with the given level and ``sample_size`` is
        ``ceil(count * percentage / 100)``.

    Raises:
        KeyError: If a question has no ``"level"`` key.
    """
    # Count matches directly instead of building a list just to measure it.
    count = sum(1 for q in questions if q["level"] == level)
    # Multiply before dividing: count * (percentage / 100.0) can overshoot an
    # exact result (100 * (7 / 100.0) == 7.000000000000001) and make ceil
    # report one sample too many.
    sample_size = math.ceil(count * percentage / 100)
    print(f"Questions {level}: {count}")
    print(f"Samples: {sample_size}")
    return count, sample_size
# --- Example usage ---
if __name__ == "__main__":
    perc = 20  # percentage of each level to sample

    # Sort rank per level; iteration order also drives the stats printout.
    lmap = {
        'easy': 1,
        'medium': 2,
        'hard': 3,
    }

    all_questions = generate_questions()

    # Report the population and expected sample size of every level.
    for level in lmap:
        log_question_stats(all_questions, percentage=perc, level=level)

    sampled_indexes = sample_question_indexes(all_questions, percentage=perc)
    sampled_questions = get_questions_by_indexes(all_questions, sampled_indexes)

    # Sort the sampled questions by level. The sort is stable, so questions
    # keep their relative order within each level.
    sampled_questions.sort(key=lambda q: lmap[q['level']])

    print("\nSampled Questions:")
    for i, q in enumerate(sampled_questions, start=1):
        print(f"{i} - {q['question']} - index: {q['index']}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment