This script reorders the questions from NODIA's chapterwise PYQs
"""
This script works with chapterwise PYQs from NODIA (mcqgpt.com) or whatever their other domains are.
The questions in within each of the chapters are ordered randomly, perhaps intentionally, with the intention to drive up paperback book sales.
Either ways, to use this script:
- pip install PyMuPDF pdfplumber
- Separate a chapter and name it something.pdf (remove all non-question pages)
- run the script
This script was supposed to remove their ADs too, but that part doesn't work
"""
import re
import os
import pdfplumber
import fitz # PyMuPDF
# --- CONFIGURATION ---
INPUT_FILENAME = "something.pdf" # Replace with your file
OUTPUT_FILENAME = "something_sorted.pdf"  # Where the reordered copy is written
# Phrases identifying the banner ads to remove
AD_PHRASES = [
"CLICK HERE TO BUY THIS BOOK FROM AMAZON",
"click here to buy this book from amazon",
"Downloaded from www.mcqgpt.com",
"downloaded from www.mcqgpt.com",
"www.mcqgpt.com"
]


def is_theory_line(line_text):
    """
    Returns True if a numbered line looks like a Theory Header (UPPERCASE).
    Returns False if it looks like a Question (Sentence case).
    """
    # Heuristic: drop the leading "<number>." and treat a line whose remaining
    # letters are all uppercase as a theory header rather than a question.
    body = re.sub(r'^\d+\.\s*', '', line_text).strip()
    has_letters = any(ch.isalpha() for ch in body)
    return has_letters and body.upper() == body
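# Hypothetical examples of the heuristic above (not taken from an actual paper):
#   "3. TRIGONOMETRIC RATIOS OF SPECIFIC ANGLES"   -> all caps after the number -> theory header
#   "3. The value of sin 30° + cos 60° is"         -> sentence case             -> question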


def find_questions_start_page(file_path):
    """
    Scans to find the first page where REAL questions begin (Sentence Case),
    skipping Uppercase Theory headers.
    """
    print("Scanning for start of Question section...")
    with pdfplumber.open(file_path) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            for line in text.split('\n'):
                line = line.strip()
                if re.match(r'^\d+\.', line) and not is_theory_line(line):
                    # Found a normal sentence-case numbered line -> it's a question page
                    return i
    return 0
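# Example (hypothetical layout): if page indices 0-1 hold only uppercase theory
# headers and the first sentence-case numbered line appears on index 2, this
# returns 2; if no question line is found at all, it falls back to 0.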


def get_sorted_page_indices(file_path, start_page_index):
    """
    Sorts ONLY the question pages based on the question number.
    """
    print(f"Sorting pages starting from Page {start_page_index + 1}...")
    candidates = []
    # 1. Collect Question Numbers
    with pdfplumber.open(file_path) as pdf:
        for i in range(start_page_index, len(pdf.pages)):
            page = pdf.pages[i]
            words = page.extract_words()
            for word in words:
                text = word['text']
                if re.match(r'^\d+\.$', text):
                    try:
                        num = int(text.strip('.'))
                        candidates.append({'page': i, 'num': num, 'x': word['x0']})
                    except ValueError:
                        continue
        if not candidates:
            return list(range(start_page_index, len(pdf.pages)))
    # 2. Filter by Margin (Left-aligned only)
    min_x = min(c['x'] for c in candidates)
    valid_margin = min_x + 15
    print(f" -> Detected Question Margin at x={min_x:.2f}")
    # 3. Assign Sort Keys
    page_scores = []
    current_key = 0
    with pdfplumber.open(file_path) as pdf:
        for i in range(start_page_index, len(pdf.pages)):
            # Find numbers on this page that are within the left margin
            valid_nums = [c['num'] for c in candidates if c['page'] == i and c['x'] <= valid_margin]
            if valid_nums:
                current_key = valid_nums[0]
            else:
                current_key += 0.001  # Maintain order for continuation pages
            page_scores.append((i, current_key))
    page_scores.sort(key=lambda x: x[1])
    return [p[0] for p in page_scores]
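# Illustration of the resulting sort keys (hypothetical page layout):
#   page index 4 starts with "17."             -> key 17
#   page index 5 has no left-margin number     -> key 17.001 (continuation page)
#   page index 6 starts with "5."              -> key 5
# Sorting by key yields [6, 4, 5], so continuation pages stay glued to the page
# that introduced their question.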


def process_and_save_pdf(input_path, output_path):
    # --- PHASE 1: Analyze Order ---
    split_index = find_questions_start_page(input_path)
    theory_indices = list(range(0, split_index))
    question_indices = get_sorted_page_indices(input_path, split_index)
    final_order = theory_indices + question_indices
    print(f"Final Page Order: Theory ({len(theory_indices)}) + Questions ({len(question_indices)})")

    # --- PHASE 2: Reorder & Clean Ads (Using PyMuPDF) ---
    print(f"Generating cleaned PDF: {output_path}...")
    doc = fitz.open(input_path)

    # 1. Reorder pages
    # doc.select() rearranges the document in-place to the new order
    doc.select(final_order)

    # 2. Remove Ads from EVERY page
    for page in doc:
        page_rect = page.rect
        for phrase in AD_PHRASES:
            text_instances = page.search_for(phrase)
            for inst in text_instances:
                # Create a white box covering the full width of the page at that height
                ad_rect = fitz.Rect(0, inst.y0 - 10, page_rect.width, inst.y1 + 10)
                page.add_redact_annot(ad_rect, fill=(1, 1, 1))
        page.apply_redactions()

    # 3. Save
    doc.save(output_path, garbage=4, deflate=True)
    doc.close()
    print("Done! File saved.")


# --- EXECUTE ---
if os.path.exists(INPUT_FILENAME):
    process_and_save_pdf(INPUT_FILENAME, OUTPUT_FILENAME)
else:
    print(f"Error: Could not find '{INPUT_FILENAME}'")