Created
January 29, 2026 19:27
-
-
Save rndparr/d4793a59e2370a374320c8d32604637a to your computer and use it in GitHub Desktop.
LaTeX Bookmarks from PDF ToC
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| \documentclass{article} | |
| % load packages: pdfpages provides \includepdf; bookmark (loaded after hyperref) provides \bookmark | |
| \usepackage{geometry} | |
| \usepackage{pdfpages} | |
| \usepackage[ | |
| pdfpagelabels=true, | |
| pdftitle={Book Title}, % add the book title | |
| pdfauthor={Book Author}, % add the book author name | |
| unicode=true, | |
| ]{hyperref} | |
| \usepackage{bookmark} | |
| % begin document | |
| \begin{document} | |
| % page number settings: arabic numerals, with the page counter set to 1 | |
| \pagenumbering{arabic} | |
| \setcounter{page}{1} | |
| % CHANGE YOUR FILE PATH HERE (pages=1- includes every page from the first to the last) | |
| \includepdf[pages=1-]{input.pdf} | |
| % BOOKMARKS | |
| % paste the output of the Python script here; optionally add other bookmarks (foreword, bibliography, index, etc.) manually | |
| \end{document} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# path to pdf
filepath = 'input.pdf'

# page numbers (starting at 1) where the table of contents begins and ends
toc_strt = 10
toc_end = 16

# page number (starting at 1) where arabic numbering actually starts
# (i.e., not the preamble, forewords, TOCs, or sections with no page
# number / roman numerals)
pg_num_strt = 20

# the number printed on the first numbered page (page pg_num_strt)
pg_num_strt_num = 3

#######################################

# fitz is the import name of PyMuPDF
import fitz
import re

# Raw TOC text lines, in reading order, with empty lines dropped.
all_lines = []

# Open the document as a context manager so the file handle is released
# even if extraction fails part-way through.
with fitz.open(filepath) as doc:
    # only read in toc pages; toc_strt is 1-based, doc.pages() is called
    # with a 0-based start and toc_end as the stop value
    for page in doc.pages(toc_strt - 1, toc_end):
        # All of the lines you'd care about for the TOC are in blocks whose
        # text starts with a digit; other blocks are headers, footers,
        # titles, etc.  Index 4 of each block tuple is the block's text.
        # Iterating the blocks directly (instead of transposing with zip)
        # keeps a page without any blocks from raising IndexError.
        for block in page.get_text('blocks'):
            text = block[4]
            if text and text[0].isdigit():
                all_lines.extend(line for line in text.split('\n') if line)
def join_while_not_end_in_num(it):
    """Merge wrapped TOC lines so that each yielded entry ends in a digit.

    Really long contents strings are split across several lines. A line
    that does not end with a digit (its page number) is assumed to continue
    on the next line, so consecutive lines are joined with a single space
    until the combined string ends with a digit.

    Args:
        it: iterable of text lines.

    Yields:
        The joined entries, in input order. Empty lines are skipped. If the
        input runs out while an entry is still incomplete, the partial entry
        is yielded as-is rather than aborting the generator with a
        RuntimeError (which is what an unguarded ``next()`` causes inside a
        generator under PEP 479).
    """
    pending = None
    for line in it:
        if not line:
            # Nothing to join, and indexing an empty string would fail.
            continue
        pending = line if pending is None else pending + ' ' + line
        if pending[-1].isdigit():
            yield pending
            pending = None
    if pending is not None:
        # Trailing fragment with no page number: hand it to the caller.
        yield pending
# Materialise the merged entries: every element is one TOC entry as text.
new_list = list(join_while_not_end_in_num(all_lines))
def join_while_not_start_in_chapnum(it):
    """Re-attach chapter numbers that sit on their own line before a title.

    The iterable is expected in *reverse* reading order. Whenever an entry
    starts with a letter, the following item (i.e. the line that preceded
    it in the original order) is prepended, separated by a single space,
    until the combined entry no longer starts with a letter.

    Args:
        it: iterable of text lines, last TOC line first.

    Yields:
        The joined entries, still in reverse reading order. Empty lines are
        skipped. If the input runs out while an entry still starts with a
        letter, that entry is yielded as-is rather than aborting the
        generator with a RuntimeError (which is what an unguarded ``next()``
        causes inside a generator under PEP 479).
    """
    pending = None
    for line in it:
        if not line:
            # Nothing to join, and indexing an empty string would fail.
            continue
        pending = line if pending is None else line + ' ' + pending
        if not pending[0].isalpha():
            yield pending
            pending = None
    if pending is not None:
        # No earlier line left to supply a chapter number.
        yield pending
# fix any chapter numbers on a separate line before the chapter title:
# the joiner prepends the *following* item, so feed it the entries backwards
# and restore reading order afterwards
new_list = list(reversed(list(join_while_not_start_in_chapnum(reversed(new_list)))))

# add a space between chapter num and title, remove any excess spaces
new_list = [re.sub(' +', ' ', re.sub(r'(\d[\.\d]*)', r'\1 ', i)) for i in new_list]

# Split every entry into [contents, page_num].  The page number is the run
# of digits at the very end; any dot leaders / spaces between the title and
# the page number are discarded.  Matching the trailing number directly
# (rather than splitting on '. ') also copes with entries that have no dot
# leaders and with titles that themselves contain '. '.
_entry_pattern = re.compile(r'(.*?)[ .]*(\d+)$')
_parsed = []
for _entry in new_list:
    _match = _entry_pattern.match(_entry.strip())
    if _match is None:
        raise ValueError(
            'TOC entry does not end with an arabic page number: ' + repr(_entry)
        )
    _parsed.append([_match.group(1).strip(), _match.group(2)])
new_list = _parsed

# get two new lists (built without zip(*...) so an empty TOC does not fail)
chapter = [title for title, _ in new_list]
old_page = tuple(number for _, number in new_list)

# update page based on PDF pages, start of numbering
page = [str(int(x) + (pg_num_strt - pg_num_strt_num)) for x in old_page]

# get the level by the number of '.'s in the digits at the start of the
# contents strings (only the leading number is inspected, so periods inside
# the title text do not inflate the level)
# d is a chapter, level 0
# d.d is level 1
# d.d.d is level 2
# etc.
level = [str(re.match(r'[\d.]*', x).group().count('.')) for x in chapter]

# tex code
# NOTE(review): titles are inserted verbatim; characters that are special in
# LaTeX (%, &, #, ...) presumably need escaping by hand -- confirm.
bkmrk_strings = [
    '\\bookmark[page=' + p + ',level=' + lvl + ']{' + title + '}'
    for p, lvl, title in zip(page, level, chapter)
]

# print the bookmarks so they can be copied and pasted into the .tex file
for string in bkmrk_strings:
    print(string)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment