rndparr · January 29, 2026 19:27
diff --git a/out.tex b/out.tex
 \documentclass{article}

 % load packages
 \usepackage{geometry}
 \usepackage{pdfpages} % supplies \includepdf, used below to pull in the source PDF

 \usepackage[
  pdfpagelabels=true,
  pdftitle={Book Title}, % add the book title
  pdfauthor={Book Author}, % add the book author name
  unicode=true,
 ]{hyperref}
 \usepackage{bookmark} % supplies the \bookmark command emitted by the Python script

 % begin document
 \begin{document}

 % page number settings
 \pagenumbering{arabic}
 \setcounter{page}{1}

 % CHANGE YOUR FILE PATH HERE
 % pages=1- includes every page of the PDF, from the first to the last
 \includepdf[pages=1-]{input.pdf}

 % BOOKMARKS
 % paste the output of pdf_toc_to_latex_bookmarks.py here; optionally add other bookmarks (foreword, bibliography, index, etc.) manually



 \end{document}
diff --git a/pdf_toc_to_latex_bookmarks.py b/pdf_toc_to_latex_bookmarks.py
 # path to the PDF whose table of contents is parsed
 filepath = 'input.pdf'

 # PDF page numbers (starting at 1) of the first and last table-of-contents pages
 toc_strt = 10
 toc_end = 16

 # PDF page number (starting at 1) where arabic numbering actually starts (i.e., after the preamble, forewords, TOCs, and any sections with no page number or with roman numerals)
 pg_num_strt = 20

 # the number printed on that first arabic-numbered page (i.e., the number shown on PDF page pg_num_strt)
 pg_num_strt_num = 3


 #######################################
 # fitz is PyMuPDF for reasons
 import fitz, re

 all_lines = []

 # The context manager closes the PDF as soon as the TOC pages have been read.
 with fitz.open(filepath) as doc:
     # only read in toc pages (doc.pages takes range()-style arguments: 0-based start, exclusive stop)
     for page in doc.pages(toc_strt - 1, toc_end):
         # Walk the blocks one by one instead of transposing them with zip(*...):
         # a page without any text block would otherwise raise IndexError.
         for block in page.get_text('blocks'):
             # item 4 of every block tuple is the block's text
             block_text = block[4]
             # all of the lines you'd care about for the TOC are in the block that starts with a digit;
             # other blocks are headers, footers, titles, etc.
             # [:1] instead of [0] so that an empty block is skipped rather than crashing
             if block_text[:1].isdigit():
                 # one entry per text line, empty strings dropped
                 all_lines.extend(line for line in block_text.split('\n') if line)

 # really long contents strings are split between lines
 # if a contents string doesnt end with a page number assume it needs to be combined with the next string in the list

 # ensure each line ends with page number
 def join_while_not_end_in_num(it):
     """Yield the strings of *it*, merging each one that lacks a page number with its successors.

     A string that does not end in a digit is taken to be the first part of a
     contents entry that was wrapped over several lines; the following strings
     are appended (separated by one space) until the result ends in a digit.

     Args:
         it: iterable of strings (TOC lines in reading order).

     Yields:
         The merged strings. If the input runs out before a trailing digit is
         found, whatever was collected so far is yielded as the last item.
     """
     it = iter(it)
     for current in it:
         # [-1:] instead of [-1]: an empty string is simply treated as "not finished yet"
         while not current[-1:].isdigit():
             try:
                 current += ' ' + next(it)
             except StopIteration:
                 # A StopIteration escaping a generator is turned into RuntimeError
                 # (PEP 479), so stop cleanly and keep the dangling text instead.
                 break
         yield current

 # materialise the merged lines so that every entry ends with its page number
 new_list = list(join_while_not_end_in_num(all_lines))

 # fix any chapter numbers on separate line before chap title
 def join_while_not_start_in_chapnum(it):
     """Yield the strings of *it*, prefixing each one that starts with a letter with its successors.

     Intended to be fed the TOC lines in REVERSE order: a string that starts
     with a letter is taken to be a title whose chapter number sits on the line
     before it, i.e. on the next item of the reversed sequence. Following items
     are prepended (separated by one space) until the result no longer starts
     with a letter.

     Args:
         it: iterable of strings.

     Yields:
         The merged strings. If the input runs out while the current string
         still starts with a letter, it is yielded as it is.
     """
     it = iter(it)
     for current in it:
         # [:1] instead of [0]: an empty string is passed through instead of raising IndexError
         while current[:1].isalpha():
             try:
                 current = next(it) + ' ' + current
             except StopIteration:
                 # A StopIteration escaping a generator is turned into RuntimeError
                 # (PEP 479), so stop cleanly and keep the unnumbered entry.
                 break
         yield current

 # walk the list back-to-front so each title line meets the chapter-number line before it, then restore reading order
 new_list = list(join_while_not_start_in_chapnum(reversed(new_list)))
 new_list.reverse()


 # Prepared once because both patterns are applied to every TOC line.
 # Chapter/section numbering at the very start of a line, e.g. "3" or "3.1.2".
 _LEADING_NUMBER = re.compile(r'^(\d[\.\d]*)')
 # Title, then a run of leader dots and/or whitespace, then the trailing page number.
 _TOC_ENTRY = re.compile(r'^(.*?)[\s.]+(\d+)$')


 def _split_toc_line(line):
     """Split one TOC line into ``[contents, page_num]`` (both strings).

     The page number is the run of digits that ends the line; the dots and
     spaces in front of it (the leaders) are dropped, whatever their spacing,
     and the rest is the contents string. Splitting from the right means that
     sections without leader dots ("1.1 Foo 7") and titles that contain ". "
     ("1.2 Dr. Who . . . 9") still give exactly two parts.

     Raises:
         ValueError: if the line does not end in a separated page number.
     """
     # add a space between the chapter number and the title, remove any excess spaces;
     # only the number at the start of the line is touched, so digits inside the
     # title ("2D", "v2.0") are left alone
     line = re.sub(' +', ' ', _LEADING_NUMBER.sub(r'\1 ', line.strip())).strip()
     entry = _TOC_ENTRY.match(line)
     if entry is None:
         raise ValueError('no trailing page number in TOC line: {!r}'.format(line))
     return [entry.group(1), entry.group(2)]


 # this gets a list of pairs: ['contents', 'page_num']
 new_list = [_split_toc_line(x) for x in new_list]

 # transpose the (contents, page_num) pairs into two parallel sequences
 chapter, old_page = zip(*new_list)

 # trim stray whitespace around every contents string
 chapter = list(map(str.strip, chapter))

 # convert printed page numbers to PDF page numbers: printed number pg_num_strt_num sits on PDF page pg_num_strt
 page = [str(pg_num_strt - pg_num_strt_num + int(printed)) for printed in old_page]

 # get the level from the number of '.'s inside the numbering at the start of the contents string
 # d is a chapter, level 0
 # d.d is level 1
 # d.d.d is level 2
 # etc.
 def _bookmark_level(title):
     """Return the nesting depth implied by the section number that starts *title*.

     Only the leading ``d.d.d`` numbering is inspected, so periods further
     along in the title (e.g. "1.2 Upgrading to v2.0") do not inflate the
     level, and a period that merely closes the numbering ("1.1.") is not
     counted. A title without leading numbering gets level 0.
     """
     numbering = re.match(r'\d+(?:\.\d+)*', title)
     return numbering.group().count('.') if numbering else 0


 level = [str(_bookmark_level(x)) for x in chapter]

 # one \bookmark command per TOC entry, assembled from the three parallel lists
 bkmrk_strings = [
     ''.join(('\\bookmark[page=', pg, ',level=', lvl, ']{', title, '}'))
     for pg, lvl, title in zip(page, level, chapter)
 ]

 # the printed lines are meant to be copied and pasted into out.tex
 for bookmark_line in bkmrk_strings:
     print(bookmark_line)
	\documentclass{article}

	% load packages
	\usepackage{geometry}
	\usepackage{pdfpages} % supplies \includepdf, used below to pull in the source PDF

	\usepackage[
	pdfpagelabels=true,
	pdftitle={Book Title}, % add the book title
	pdfauthor={Book Author}, % add the book author name
	unicode=true,
	]{hyperref}
	\usepackage{bookmark} % supplies the \bookmark command emitted by the Python script

	% begin document
	\begin{document}

	% page number settings
	\pagenumbering{arabic}
	\setcounter{page}{1}

	% CHANGE YOUR FILE PATH HERE
	% pages=1- includes every page of the PDF, from the first to the last
	\includepdf[pages=1-]{input.pdf}

	% BOOKMARKS
	% paste the output of pdf_toc_to_latex_bookmarks.py here; optionally add other bookmarks (foreword, bibliography, index, etc.) manually



	\end{document}
	# path to the PDF whose table of contents is parsed
	filepath = 'input.pdf'

	# PDF page numbers (starting at 1) of the first and last table-of-contents pages
	toc_strt = 10
	toc_end = 16

	# PDF page number (starting at 1) where arabic numbering actually starts (i.e., after the preamble, forewords, TOCs, and any sections with no page number or with roman numerals)
	pg_num_strt = 20

	# the number printed on that first arabic-numbered page (i.e., the number shown on PDF page pg_num_strt)
	pg_num_strt_num = 3


	#######################################
	# fitz is PyMuPDF for reasons
	import fitz, re

	all_lines = []

	# The context manager closes the PDF as soon as the TOC pages have been read.
	with fitz.open(filepath) as doc:
	    # only read in toc pages (doc.pages takes range()-style arguments: 0-based start, exclusive stop)
	    for page in doc.pages(toc_strt - 1, toc_end):
	        # Walk the blocks one by one instead of transposing them with zip(*...):
	        # a page without any text block would otherwise raise IndexError.
	        for block in page.get_text('blocks'):
	            # item 4 of every block tuple is the block's text
	            block_text = block[4]
	            # all of the lines you'd care about for the TOC are in the block that starts with a digit;
	            # other blocks are headers, footers, titles, etc.
	            # [:1] instead of [0] so that an empty block is skipped rather than crashing
	            if block_text[:1].isdigit():
	                # one entry per text line, empty strings dropped
	                all_lines.extend(line for line in block_text.split('\n') if line)

	# really long contents strings are split between lines
	# if a contents string doesnt end with a page number assume it needs to be combined with the next string in the list

	# ensure each line ends with page number
	def join_while_not_end_in_num(it):
	    """Yield the strings of *it*, merging each one that lacks a page number with its successors.

	    A string that does not end in a digit is taken to be the first part of a
	    contents entry that was wrapped over several lines; the following strings
	    are appended (separated by one space) until the result ends in a digit.

	    Args:
	        it: iterable of strings (TOC lines in reading order).

	    Yields:
	        The merged strings. If the input runs out before a trailing digit is
	        found, whatever was collected so far is yielded as the last item.
	    """
	    it = iter(it)
	    for current in it:
	        # [-1:] instead of [-1]: an empty string is simply treated as "not finished yet"
	        while not current[-1:].isdigit():
	            try:
	                current += ' ' + next(it)
	            except StopIteration:
	                # A StopIteration escaping a generator is turned into RuntimeError
	                # (PEP 479), so stop cleanly and keep the dangling text instead.
	                break
	        yield current

	# materialise the merged lines so that every entry ends with its page number
	new_list = list(join_while_not_end_in_num(all_lines))

	# fix any chapter numbers on separate line before chap title
	def join_while_not_start_in_chapnum(it):
	    """Yield the strings of *it*, prefixing each one that starts with a letter with its successors.

	    Intended to be fed the TOC lines in REVERSE order: a string that starts
	    with a letter is taken to be a title whose chapter number sits on the line
	    before it, i.e. on the next item of the reversed sequence. Following items
	    are prepended (separated by one space) until the result no longer starts
	    with a letter.

	    Args:
	        it: iterable of strings.

	    Yields:
	        The merged strings. If the input runs out while the current string
	        still starts with a letter, it is yielded as it is.
	    """
	    it = iter(it)
	    for current in it:
	        # [:1] instead of [0]: an empty string is passed through instead of raising IndexError
	        while current[:1].isalpha():
	            try:
	                current = next(it) + ' ' + current
	            except StopIteration:
	                # A StopIteration escaping a generator is turned into RuntimeError
	                # (PEP 479), so stop cleanly and keep the unnumbered entry.
	                break
	        yield current

	# walk the list back-to-front so each title line meets the chapter-number line before it, then restore reading order
	new_list = list(join_while_not_start_in_chapnum(reversed(new_list)))
	new_list.reverse()


	# Prepared once because both patterns are applied to every TOC line.
	# Chapter/section numbering at the very start of a line, e.g. "3" or "3.1.2".
	_LEADING_NUMBER = re.compile(r'^(\d[\.\d]*)')
	# Title, then a run of leader dots and/or whitespace, then the trailing page number.
	_TOC_ENTRY = re.compile(r'^(.*?)[\s.]+(\d+)$')


	def _split_toc_line(line):
	    """Split one TOC line into ``[contents, page_num]`` (both strings).

	    The page number is the run of digits that ends the line; the dots and
	    spaces in front of it (the leaders) are dropped, whatever their spacing,
	    and the rest is the contents string. Splitting from the right means that
	    sections without leader dots ("1.1 Foo 7") and titles that contain ". "
	    ("1.2 Dr. Who . . . 9") still give exactly two parts.

	    Raises:
	        ValueError: if the line does not end in a separated page number.
	    """
	    # add a space between the chapter number and the title, remove any excess spaces;
	    # only the number at the start of the line is touched, so digits inside the
	    # title ("2D", "v2.0") are left alone
	    line = re.sub(' +', ' ', _LEADING_NUMBER.sub(r'\1 ', line.strip())).strip()
	    entry = _TOC_ENTRY.match(line)
	    if entry is None:
	        raise ValueError('no trailing page number in TOC line: {!r}'.format(line))
	    return [entry.group(1), entry.group(2)]


	# this gets a list of pairs: ['contents', 'page_num']
	new_list = [_split_toc_line(x) for x in new_list]

	# transpose the (contents, page_num) pairs into two parallel sequences
	chapter, old_page = zip(*new_list)

	# trim stray whitespace around every contents string
	chapter = list(map(str.strip, chapter))

	# convert printed page numbers to PDF page numbers: printed number pg_num_strt_num sits on PDF page pg_num_strt
	page = [str(pg_num_strt - pg_num_strt_num + int(printed)) for printed in old_page]

	# get the level from the number of '.'s inside the numbering at the start of the contents string
	# d is a chapter, level 0
	# d.d is level 1
	# d.d.d is level 2
	# etc.
	def _bookmark_level(title):
	    """Return the nesting depth implied by the section number that starts *title*.

	    Only the leading ``d.d.d`` numbering is inspected, so periods further
	    along in the title (e.g. "1.2 Upgrading to v2.0") do not inflate the
	    level, and a period that merely closes the numbering ("1.1.") is not
	    counted. A title without leading numbering gets level 0.
	    """
	    numbering = re.match(r'\d+(?:\.\d+)*', title)
	    return numbering.group().count('.') if numbering else 0


	level = [str(_bookmark_level(x)) for x in chapter]

	# one \bookmark command per TOC entry, assembled from the three parallel lists
	bkmrk_strings = [
	    ''.join(('\\bookmark[page=', pg, ',level=', lvl, ']{', title, '}'))
	    for pg, lvl, title in zip(page, level, chapter)
	]

	# the printed lines are meant to be copied and pasted into out.tex
	# (the print call is indented again so that it really is the loop body)
	for string in bkmrk_strings:
	    print(string)