Created
January 23, 2026 15:10
-
-
Save Rajdave69/ce57e81f27e1b5a3016cf274c3e899fe to your computer and use it in GitHub Desktop.
This script reorders the questions from NODIA's chapterwise PYQs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
This script works with chapterwise PYQs from NODIA (mcqgpt.com) or whatever their other domains are.

The questions within each of the chapters are ordered randomly, perhaps intentionally, with the
intention to drive up paperback book sales.

Either way, to use this script:
- pip install PyMuPDF pdfplumber
- Separate a chapter and name it something.pdf (remove all non-question pages)
- run the script

This script was supposed to remove their ads too, but that part doesn't work.
"""
import re
import os
import pdfplumber
import fitz  # PyMuPDF
# --- CONFIGURATION ---
INPUT_FILENAME = "something.pdf"  # Replace with your file
# NOTE(review): the space before ".pdf" appears deliberate — it keeps the
# output from overwriting the input file. Confirm before "fixing" the name.
OUTPUT_FILENAME = "something .pdf"
# Phrases identifying the banner ads to remove.
# Both upper- and lower-case variants are listed explicitly; whether
# Page.search_for() matches case-insensitively depends on the PyMuPDF
# version, so the duplicates are kept as a belt-and-braces measure.
AD_PHRASES = [
    "CLICK HERE TO BUY THIS BOOK FROM AMAZON",
    "click here to buy this book from amazon",
    "Downloaded from www.mcqgpt.com",
    "downloaded from www.mcqgpt.com",
    "www.mcqgpt.com"
]
def is_theory_line(line_text):
    """
    Classify a numbered line as a theory header or a question.

    Returns True if the line looks like a Theory Header (text after the
    leading "N." number is entirely UPPERCASE, e.g. "1. NETWORK THEOREMS").
    Returns False if it looks like a Question (sentence case), or if the
    line has no alphabetic characters to judge by.

    BUG FIX: the original always returned False, which made the
    theory-page skipping in find_questions_start_page() dead code; this
    implements the behavior the docstring always described.
    """
    # Strip the leading question/section number ("12. ") before testing case.
    body = re.sub(r'^\d+\.\s*', '', line_text).strip()
    # isupper() is False for strings with no cased characters, so a purely
    # numeric/symbolic line is treated as a question, not a header.
    return any(c.isalpha() for c in body) and body.isupper()
def find_questions_start_page(file_path):
    """
    Return the 0-based index of the first page where real questions begin.

    A "real question" is a numbered line ("N. ...") that is_theory_line()
    does not classify as an uppercase theory header. Pages containing only
    theory headers (or no text at all) are skipped. Falls back to 0 when
    no question line is found in the whole document.

    FIX: removed the `found_question` / `found_theory` locals, which were
    assigned but never read in the original.
    """
    print("Scanning for start of Question section...")
    with pdfplumber.open(file_path) as pdf:
        for page_index, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            for raw_line in text.split('\n'):
                line = raw_line.strip()
                # Only numbered lines are candidates; a sentence-case one
                # marks the start of the question section.
                if re.match(r'^\d+\.', line) and not is_theory_line(line):
                    return page_index
    return 0
def get_sorted_page_indices(file_path, start_page_index):
    """
    Return the indices of pages from start_page_index onward, reordered so
    that question numbers run in ascending order.

    Pages whose left-margin-aligned "N." token gives them a question number
    are keyed on that number; continuation pages (no number) inherit the
    previous page's key plus a tiny epsilon so they stay right behind it.

    FIX: the original opened the PDF twice (once to collect candidates,
    once to assign sort keys) and its no-candidate fallback read
    `pdf.pages` in a position that could sit outside the first `with`
    block; everything now happens inside a single open.
    """
    print(f"Sorting pages starting from Page {start_page_index + 1}...")
    with pdfplumber.open(file_path) as pdf:
        total_pages = len(pdf.pages)

        # 1. Collect every "N." token with its page index and x-position.
        candidates = []
        for i in range(start_page_index, total_pages):
            for word in pdf.pages[i].extract_words():
                token = word['text']
                if re.match(r'^\d+\.$', token):
                    try:
                        num = int(token.strip('.'))
                    except ValueError:
                        continue
                    candidates.append({'page': i, 'num': num, 'x': word['x0']})

        if not candidates:
            # Nothing numbered found: keep the existing order.
            return list(range(start_page_index, total_pages))

        # 2. Filter by margin — question numbers sit at the leftmost column;
        #    allow 15pt of jitter around the smallest x seen.
        min_x = min(c['x'] for c in candidates)
        valid_margin = min_x + 15
        print(f" -> Detected Question Margin at x={min_x:.2f}")

        # 3. Assign a sort key to every page.
        page_scores = []
        current_key = 0
        for i in range(start_page_index, total_pages):
            valid_nums = [c['num'] for c in candidates
                          if c['page'] == i and c['x'] <= valid_margin]
            if valid_nums:
                current_key = valid_nums[0]
            else:
                current_key += 0.001  # Maintain order for continuation pages
            page_scores.append((i, current_key))

    page_scores.sort(key=lambda score: score[1])
    return [p[0] for p in page_scores]
def process_and_save_pdf(input_path, output_path):
    """
    Build the cleaned, reordered PDF.

    Phase 1 computes the final page order: all theory pages first (in their
    original order), then the question pages sorted by question number.
    Phase 2 rewrites the document with PyMuPDF, whitening out any ad
    banners matched by AD_PHRASES, and saves it to output_path.
    """
    # --- PHASE 1: Analyze Order ---
    split_index = find_questions_start_page(input_path)
    theory_indices = list(range(split_index))
    question_indices = get_sorted_page_indices(input_path, split_index)
    final_order = theory_indices + question_indices
    print(f"Final Page Order: Theory ({len(theory_indices)}) + Questions ({len(question_indices)})")

    # --- PHASE 2: Reorder & Clean Ads (Using PyMuPDF) ---
    print(f"Generating cleaned PDF: {output_path}...")
    doc = fitz.open(input_path)

    # Rearrange the document in-place to the computed order.
    doc.select(final_order)

    # White out every ad banner, on every page.
    for page in doc:
        page_width = page.rect.width
        for phrase in AD_PHRASES:
            for hit in page.search_for(phrase):
                # Cover the full page width at the banner's height,
                # with 10pt of padding above and below.
                banner = fitz.Rect(0, hit.y0 - 10, page_width, hit.y1 + 10)
                page.add_redact_annot(banner, fill=(1, 1, 1))
        page.apply_redactions()

    doc.save(output_path, garbage=4, deflate=True)
    doc.close()
    print("Done! File saved.")
# --- EXECUTE ---
if __name__ == "__main__":
    # FIX: guard the entry point so importing this module no longer
    # kicks off PDF processing as a side effect.
    if os.path.exists(INPUT_FILENAME):
        process_and_save_pdf(INPUT_FILENAME, OUTPUT_FILENAME)
    else:
        print(f"Error: Could not find '{INPUT_FILENAME}'")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment