Created
January 23, 2026 15:10
-
-
Save Rajdave69/ce57e81f27e1b5a3016cf274c3e899fe to your computer and use it in GitHub Desktop.
This script reorders the questions from NODIA's chapterwise PYQs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
This script works with chapterwise PYQs from NODIA (mcqgpt.com) or whatever their other domains are.

The questions within each of the chapters are ordered randomly, perhaps intentionally, with the
intention to drive up paperback book sales.

Either way, to use this script:
- pip install PyMuPDF pdfplumber
- Separate a chapter and name it something.pdf (remove all non-question pages)
- run the script

This script was supposed to remove their ads too, but that part doesn't work.
"""
import re
import os
import pdfplumber
import fitz  # PyMuPDF
# --- CONFIGURATION ---
INPUT_FILENAME = "something.pdf"  # Replace with your file
# NOTE(review): the space before ".pdf" appears deliberate — it keeps the
# output from overwriting the input file. Confirm before "fixing" the name.
OUTPUT_FILENAME = "something .pdf"
# Phrases identifying the banner ads to remove.
# Both upper- and lower-case variants are listed explicitly; whether
# Page.search_for() matches case-insensitively depends on the PyMuPDF
# version, so the duplicates are kept as a belt-and-braces measure.
AD_PHRASES = [
    "CLICK HERE TO BUY THIS BOOK FROM AMAZON",
    "click here to buy this book from amazon",
    "Downloaded from www.mcqgpt.com",
    "downloaded from www.mcqgpt.com",
    "www.mcqgpt.com"
]
def is_theory_line(line_text):
    """
    Classify a numbered line as a theory header or a question.

    Returns True if the line looks like a Theory Header (text after the
    leading "N." number is entirely UPPERCASE, e.g. "1. NETWORK THEOREMS").
    Returns False if it looks like a Question (sentence case), or if the
    line has no alphabetic characters to judge by.

    BUG FIX: the original always returned False, which made the
    theory-page skipping in find_questions_start_page() dead code; this
    implements the behavior the docstring always described.
    """
    # Strip the leading question/section number ("12. ") before testing case.
    body = re.sub(r'^\d+\.\s*', '', line_text).strip()
    # isupper() is False for strings with no cased characters, so a purely
    # numeric/symbolic line is treated as a question, not a header.
    return any(c.isalpha() for c in body) and body.isupper()
def find_questions_start_page(file_path):
    """
    Return the 0-based index of the first page where real questions begin.

    A "real question" is a numbered line ("N. ...") that is_theory_line()
    does not classify as an uppercase theory header. Pages containing only
    theory headers (or no text at all) are skipped. Falls back to 0 when
    no question line is found in the whole document.

    FIX: removed the `found_question` / `found_theory` locals, which were
    assigned but never read in the original.
    """
    print("Scanning for start of Question section...")
    with pdfplumber.open(file_path) as pdf:
        for page_index, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            for raw_line in text.split('\n'):
                line = raw_line.strip()
                # Only numbered lines are candidates; a sentence-case one
                # marks the start of the question section.
                if re.match(r'^\d+\.', line) and not is_theory_line(line):
                    return page_index
    return 0
def get_sorted_page_indices(file_path, start_page_index):
    """
    Return the indices of pages from start_page_index onward, reordered so
    that question numbers run in ascending order.

    Pages whose left-margin-aligned "N." token gives them a question number
    are keyed on that number; continuation pages (no number) inherit the
    previous page's key plus a tiny epsilon so they stay right behind it.

    FIX: the original opened the PDF twice (once to collect candidates,
    once to assign sort keys) and its no-candidate fallback read
    `pdf.pages` in a position that could sit outside the first `with`
    block; everything now happens inside a single open.
    """
    print(f"Sorting pages starting from Page {start_page_index + 1}...")
    with pdfplumber.open(file_path) as pdf:
        total_pages = len(pdf.pages)

        # 1. Collect every "N." token with its page index and x-position.
        candidates = []
        for i in range(start_page_index, total_pages):
            for word in pdf.pages[i].extract_words():
                token = word['text']
                if re.match(r'^\d+\.$', token):
                    try:
                        num = int(token.strip('.'))
                    except ValueError:
                        continue
                    candidates.append({'page': i, 'num': num, 'x': word['x0']})

        if not candidates:
            # Nothing numbered found: keep the existing order.
            return list(range(start_page_index, total_pages))

        # 2. Filter by margin — question numbers sit at the leftmost column;
        #    allow 15pt of jitter around the smallest x seen.
        min_x = min(c['x'] for c in candidates)
        valid_margin = min_x + 15
        print(f" -> Detected Question Margin at x={min_x:.2f}")

        # 3. Assign a sort key to every page.
        page_scores = []
        current_key = 0
        for i in range(start_page_index, total_pages):
            valid_nums = [c['num'] for c in candidates
                          if c['page'] == i and c['x'] <= valid_margin]
            if valid_nums:
                current_key = valid_nums[0]
            else:
                current_key += 0.001  # Maintain order for continuation pages
            page_scores.append((i, current_key))

    page_scores.sort(key=lambda score: score[1])
    return [p[0] for p in page_scores]
def process_and_save_pdf(input_path, output_path):
    """
    Build the cleaned, reordered PDF.

    Phase 1 computes the final page order: all theory pages first (in their
    original order), then the question pages sorted by question number.
    Phase 2 rewrites the document with PyMuPDF, whitening out any ad
    banners matched by AD_PHRASES, and saves it to output_path.
    """
    # --- PHASE 1: Analyze Order ---
    split_index = find_questions_start_page(input_path)
    theory_indices = list(range(split_index))
    question_indices = get_sorted_page_indices(input_path, split_index)
    final_order = theory_indices + question_indices
    print(f"Final Page Order: Theory ({len(theory_indices)}) + Questions ({len(question_indices)})")

    # --- PHASE 2: Reorder & Clean Ads (Using PyMuPDF) ---
    print(f"Generating cleaned PDF: {output_path}...")
    doc = fitz.open(input_path)

    # Rearrange the document in-place to the computed order.
    doc.select(final_order)

    # White out every ad banner, on every page.
    for page in doc:
        page_width = page.rect.width
        for phrase in AD_PHRASES:
            for hit in page.search_for(phrase):
                # Cover the full page width at the banner's height,
                # with 10pt of padding above and below.
                banner = fitz.Rect(0, hit.y0 - 10, page_width, hit.y1 + 10)
                page.add_redact_annot(banner, fill=(1, 1, 1))
        page.apply_redactions()

    doc.save(output_path, garbage=4, deflate=True)
    doc.close()
    print("Done! File saved.")
# --- EXECUTE ---
if __name__ == "__main__":
    # FIX: guard the entry point so importing this module no longer
    # kicks off PDF processing as a side effect.
    if os.path.exists(INPUT_FILENAME):
        process_and_save_pdf(INPUT_FILENAME, OUTPUT_FILENAME)
    else:
        print(f"Error: Could not find '{INPUT_FILENAME}'")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment