Created
July 2, 2025 10:55
-
-
Save gabrielqmatos88/d7c2bf74ae53a487bbab3fdb749a1eca to your computer and use it in GitHub Desktop.
Python script that generates a sample of a dataset while keeping the proportion of question levels
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import math | |
| from typing import List, Dict | |
def get_questions_by_level(questions: list[dict], level: str):
    """Return the questions whose ``"level"`` entry equals ``level``.

    The relative order of ``questions`` is preserved. Each question is
    looked up with ``q["level"]``, so a question without that key raises
    ``KeyError``.
    """
    return list(filter(lambda q: q["level"] == level, questions))
| # Step 1: Generate 302 dummy questions (189 easy, 84 medium, 29 hard) with labeled difficulty | |
def generate_questions() -> List[Dict]:
    """Build the dummy question set with a fixed number of questions per level.

    Produces 189 easy, 84 medium and 29 hard questions, in that order.
    Each question is a dict with three keys: "question" (display text),
    "level" and "index" (its 0-based position in the returned list).
    """
    level_counts = (('easy', 189), ('medium', 84), ('hard', 29))
    generated = []
    for level, count in level_counts:
        for _ in range(count):
            # The running length doubles as the global question index.
            position = len(generated)
            generated.append({
                "question": f"Question {position} - {level}",
                "level": level,
                "index": position,
            })
    return generated
| # Step 2: Sample question indexes by level and given percentage | |
def sample_question_indexes(questions: List[Dict], percentage: float) -> List[int]:
    """Randomly pick ``percentage`` percent of the question indexes per level.

    For each of the levels 'easy', 'medium' and 'hard', the "index" values
    of the matching questions are sampled without replacement. The per-level
    sample size is rounded up, so any non-empty level contributes at least
    one index whenever ``percentage`` is greater than zero. Questions with
    any other level are ignored.

    Args:
        questions: dicts that each carry a "level" and an "index" key.
        percentage: share of each level to sample, from 0 to 100.

    Returns:
        The sampled indexes as built-in ``int`` values, in ascending order.

    Raises:
        ValueError: if ``percentage`` is not within 0..100.
    """
    # Fail early with a clear message; otherwise a value above 100 only
    # surfaces as an opaque NumPy sampling error.
    if not 0 <= percentage <= 100:
        raise ValueError(
            f"percentage must be between 0 and 100, got {percentage!r}"
        )

    fraction = percentage / 100.0
    sampled_indexes: List[int] = []
    for level in ('easy', 'medium', 'hard'):
        level_indexes = [q["index"] for q in questions if q["level"] == level]
        if not level_indexes:
            # Nothing to draw from for this level.
            continue
        sample_size = math.ceil(len(level_indexes) * fraction)
        chosen = np.random.choice(level_indexes, size=sample_size, replace=False)
        # tolist() converts NumPy scalars to plain Python ints so the result
        # really is a List[int] (e.g. JSON-serialisable).
        sampled_indexes.extend(chosen.tolist())
    return sorted(sampled_indexes)  # ascending index order
| # Step 3: Retrieve questions by sampled indexes | |
def get_questions_by_indexes(questions: List[Dict], indexes: List[int]) -> List[Dict]:
    """Return the questions whose "index" value appears in ``indexes``.

    The result follows the order of ``questions``, not the order of
    ``indexes``; duplicate or unknown indexes have no effect.
    """
    wanted = frozenset(indexes)  # constant-time membership checks
    selected = []
    for question in questions:
        if question["index"] not in wanted:
            continue
        selected.append(question)
    return selected
| # Step 4: Log the count of questions and the expected proportion | |
def log_question_stats(questions: List[Dict], percentage: float, level: str):
    """Print and return the question count and expected sample size for a level.

    Args:
        questions: the full question list.
        percentage: share of the level to be sampled, expressed in percent.
        level: the level to report on.

    Returns:
        A ``(count, sample_size)`` tuple, where ``sample_size`` is
        ``count * percentage / 100`` rounded up.
    """
    count = len(get_questions_by_level(questions, level=level))
    fraction = percentage / 100.0
    sample_size = math.ceil(count * fraction)
    print(f"Questions {level}: {count}")
    print(f"Samples: {sample_size}")
    return count, sample_size
| # --- Example usage --- | |
if __name__ == "__main__":
    perc = 20
    all_questions = generate_questions()

    # Report the per-level totals and the expected sample sizes.
    for level_name in ('easy', 'medium', 'hard'):
        log_question_stats(all_questions, percentage=perc, level=level_name)

    sampled_indexes = sample_question_indexes(all_questions, percentage=perc)
    sampled_questions = get_questions_by_indexes(all_questions, sampled_indexes)

    # Order the sample easy -> medium -> hard. list.sort is stable, so the
    # ascending index order is kept within each level.
    lmap = {'easy': 1, 'medium': 2, 'hard': 3}
    sampled_questions.sort(key=lambda q: lmap.get(q['level']))

    print("\nSampled Questions:")
    for position, q in enumerate(sampled_questions, start=1):
        print(f"{position} - {q['question']} - index: {q['index']}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment