Skip to content

Instantly share code, notes, and snippets.

@rndparr
Created January 29, 2026 19:27
Show Gist options
  • Select an option

  • Save rndparr/d4793a59e2370a374320c8d32604637a to your computer and use it in GitHub Desktop.

Select an option

Save rndparr/d4793a59e2370a374320c8d32604637a to your computer and use it in GitHub Desktop.
LaTeX Bookmarks from PDF ToC
% Wrapper document: embeds an existing PDF and adds PDF bookmarks (outline entries) to it.
\documentclass{article}
% load packages
\usepackage{geometry}
\usepackage{pdfpages} % provides \includepdf
\usepackage[
pdfpagelabels=true,
pdftitle={Book Title}, % add the book title
pdfauthor={Book Author}, % add the book author name
unicode=true,
]{hyperref}
\usepackage{bookmark} % provides \bookmark
% begin document
\begin{document}
% page number settings
\pagenumbering{arabic}
\setcounter{page}{1}
% CHANGE YOUR FILE PATH HERE
% pages=1- selects page 1 through the last page of the PDF
\includepdf[pages=1-]{input.pdf}
% BOOKMARKS
% paste the output of the Python script here; other bookmarks (foreword, bibliography, index, etc.) can be added manually
\end{document}
# path to the PDF whose table of contents is read
filepath = 'input.pdf'
# PDF page numbers (starting at 1) of the first and last table-of-contents pages
toc_strt = 10
toc_end = 16
# PDF page number (starting at 1) where arabic numbering actually starts (i.e. after the preamble, forewords, TOCs, and any sections with no page number or roman numerals)
pg_num_strt = 20
# the page number printed on that first numbered page (i.e. the number shown on PDF page pg_num_strt)
pg_num_strt_num = 3
#######################################
# fitz is PyMuPDF for reasons
import fitz, re
# Read the raw table-of-contents lines from the PDF.
all_lines = []
# open via a context manager so the file handle is released once the text is extracted
with fitz.open(filepath) as doc:
    # only read in toc pages; doc.pages() is 0-based with an exclusive stop,
    # hence toc_strt - 1 up to toc_end
    for toc_page in doc.pages(toc_strt - 1, toc_end):
        # every item returned for 'blocks' is a tuple whose element 4 is the block's text
        for block in toc_page.get_text('blocks'):
            block_text = block[4]
            # the TOC entries are in the block(s) whose text starts with a digit;
            # other blocks are headers, footers, titles, etc.
            # (the emptiness check avoids an IndexError on a block with no text)
            if block_text and block_text[0].isdigit():
                # split the block into lines, dropping empty strings
                all_lines.extend(text_line for text_line in block_text.split('\n') if text_line)
# really long contents strings are split between lines
# if a contents string doesn't end with a page number, assume it needs to be combined with the next string in the list
def join_while_not_end_in_num(it):
    """Yield the strings of *it*, merging wrapped lines into single entries.

    A string that does not end with a digit is treated as the start of an
    entry that continues on the following string(s); the pieces are joined
    with a single space until the accumulated text ends with a digit.

    If the input runs out before a digit-terminated piece is found, the
    text accumulated so far is yielded as the final item instead of being
    lost. (Calling ``next()`` inside the generator would otherwise surface
    as ``RuntimeError: generator raised StopIteration``.)
    """
    pending = None
    for piece in it:
        pending = piece if pending is None else pending + ' ' + piece
        # slicing (instead of indexing) keeps an empty string from raising IndexError
        if pending[-1:].isdigit():
            yield pending
            pending = None
    if pending is not None:
        # input ended in the middle of an entry: hand back what was collected
        yield pending
# merge TOC entries that were wrapped over several lines, so each entry ends with its page number
new_list = list(join_while_not_end_in_num(all_lines))
# fix any chapter numbers on a separate line before the chapter title
def join_while_not_start_in_chapnum(it):
    """Yield the strings of *it*, prefixing letter-initial items with the next item(s).

    The function is meant to be fed the entries in reverse order: an item
    that starts with a letter is a title whose chapter number is on the
    line before it, i.e. on the *next* item of the reversed sequence. That
    next item is prepended (separated by a single space) until the
    accumulated text no longer starts with a letter.

    If the input runs out while the accumulated text still starts with a
    letter, it is yielded as the final item instead of being lost.
    (Calling ``next()`` inside the generator would otherwise surface as
    ``RuntimeError: generator raised StopIteration``.)
    """
    pending = None
    for piece in it:
        pending = piece if pending is None else piece + ' ' + pending
        # slicing (instead of indexing) keeps an empty string from raising IndexError
        if not pending[:1].isalpha():
            yield pending
            pending = None
    if pending is not None:
        # input ended before a chapter number was found for this title
        yield pending
# re-attach chapter numbers that sit on their own line before the chapter title;
# walking the list backwards lets each title pull in the number line that precedes it
new_list = list(join_while_not_start_in_chapnum(reversed(new_list)))
new_list.reverse()

# section number at the start of an entry: d, d.d, d.d.d, ... plus an optional trailing '.'
section_num_re = re.compile(r'^(\d+(?:\.\d+)*)(\.?)')
# printed page number at the very end of an entry
page_num_re = re.compile(r'(\d+)$')

# split every entry into ['contents', 'page_num']
# the page number is always the last thing in an entry, so it is taken from the
# end; splitting on '. ' instead would break on titles that contain '. ' themselves
entries = []
for raw_entry in new_list:
    entry = re.sub(' +', ' ', raw_entry).strip()
    page_match = page_num_re.search(entry)
    if page_match is None:
        raise ValueError('TOC entry does not end with a page number: %r' % entry)
    # everything before the page number, minus the dot leaders and padding
    title = entry[:page_match.start()].rstrip(' .')
    # add space between chapter num and title (only the leading number is
    # touched, so numbers inside the title stay as they are), remove any excess spaces
    title = section_num_re.sub(r'\1\2 ', title)
    title = re.sub(' +', ' ', title).strip()
    if not title:
        # a bare number (e.g. a page footer caught in the TOC block) is nothing to bookmark
        continue
    entries.append([title, page_match.group(1)])
new_list = entries

# get two new lists
chapter = [title for title, _ in new_list]
old_page = [printed for _, printed in new_list]

# update page based on PDF pages, start of numbering
page_offset = pg_num_strt - pg_num_strt_num
page = [str(int(printed) + page_offset) for printed in old_page]

# get the level by the number of '.'s inside the section number at the start of the contents strings
# d is a chapter, level 0
# d.d is level 1
# d.d.d is level 2
# etc.
# (dots elsewhere in the title, and a trailing '.' after the number, do not count)
level = []
for title in chapter:
    num_match = section_num_re.match(title)
    level.append(str(num_match.group(1).count('.')) if num_match else '0')

# tex code
bkmrk_strings = ['\\bookmark[page=%s,level=%s]{%s}' % fields for fields in zip(page, level, chapter)]

# print the bookmarks so they can be copied and pasted into the .tex file
for string in bkmrk_strings:
    print(string)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment