Skip to content

Instantly share code, notes, and snippets.

@Tblue
Last active September 4, 2025 22:09
Show Gist options
  • Select an option

  • Save Tblue/2cb078b2499ff7ffc7fed864b1f009d7 to your computer and use it in GitHub Desktop.

Select an option

Save Tblue/2cb078b2499ff7ffc7fed864b1f009d7 to your computer and use it in GitHub Desktop.
Extract embedded files from PDF files
#!/usr/bin/env python3
###############################################################################
#
# Extract embedded files from PDF files.
#
###############################################################################
#
# I recently got a strange PDF file from an insurance company. It looked like
# an email, complete with attached PDF files... that I couldn't open without
# installing the rather subpar Adobe Acrobat Reader (which only runs on Windows
# and macOS, of course, but not on Linux).
#
# It turns out that these "attachments" are not actually PDF attachments as
# such, but *embedded* PDF files. And my PDF reader of choice (MuPDF-based)
# cannot handle those embedded files (it *can* apparently handle "real" PDF
# attachments, though...).
#
# So here is a Python script that uses QPDF [1] to extract those embedded files
# into separate files.
#
# Tested with Python 3.13 on Linux.
#
# Caveat: Doesn't handle encrypted PDF files. You'll need to decrypt them
# before passing them to this script, e.g. by running:
#
# qpdf --decrypt --remove-restrictions --replace-input --password='PASS' FILE.pdf
#
# Enjoy.
#
#
# [1] https://qpdf.sourceforge.io/
#
###############################################################################
#
# Copyright 2025 Tilman BLUMENBACH
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
###############################################################################
import argparse
import json
import logging
import os
import pathlib
import shutil
import subprocess
import sys
import tempfile
from pprint import pprint
def get_argparser():
    """Build the command-line parser for this tool.

    The parser takes two positional arguments: ``input_file`` (the PDF to
    read) and ``output_dir`` (where extracted files are placed).

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description='Extract embedded files from PDF files.',
        epilog=(
            'Embedded files are different from attachments, and this tool '
            'is about the former. The latter can be operated on using '
            'tools like qpdf(1).'
        ),
    )

    # (name, help text) for each positional argument, in command-line order.
    positionals = (
        ('input_file', 'PDF input file.'),
        ('output_dir', 'Output directory, will be created if missing.'),
    )
    for name, help_text in positionals:
        parser.add_argument(name, help=help_text)

    return parser
def run_qpdf(input_file, temp_dir):
    """Dump a PDF to QPDF JSON (version 2) and return the parsed result.

    qpdf writes the JSON document to ``data.json`` inside *temp_dir* and is
    told to store stream data in separate files whose paths start with
    ``<temp_dir>/stream``.

    Args:
        input_file: Path of the PDF file to read.
        temp_dir: Existing directory that receives qpdf's output files.

    Returns:
        The decoded JSON document that qpdf produced.

    Raises:
        subprocess.CalledProcessError: If qpdf exits with a non-zero status.
    """
    json_path = f'{temp_dir}/data.json'
    command = [
        'qpdf',
        '--warning-exit-0',
        '--progress',
        '--json=2',
        '--json-stream-data=file',
        f'--json-stream-prefix={temp_dir}/stream',
        # "--" ends option parsing, so the file names are never mistaken
        # for options.
        '--', input_file, json_path,
    ]
    subprocess.run(command, check=True)

    with open(json_path, 'r', encoding='utf-8') as json_file:
        document = json.load(json_file)
    return document
def main():
    """Extract all embedded files of a PDF into the output directory.

    Parses the command line, dumps the input PDF to JSON with ``run_qpdf``
    (stream data is written to files in a temporary directory), pairs every
    ``/Filespec`` dictionary with the stream referenced by its
    ``/EF`` -> ``/F`` entry, and moves each such stream data file to
    ``<output_dir>/<basename of the embedded filename>``.

    Entries that cannot be extracted (no stream data file, a filename that
    is not a "u:"-prefixed string, or a filename without a usable basename)
    are logged as warnings and skipped. Errors raised by ``run_qpdf``
    propagate to the caller.

    Returns:
        None, so ``sys.exit(main())`` exits with status 0.
    """
    myname = os.path.basename(sys.argv[0])
    args = get_argparser().parse_args()
    logging.basicConfig(level='INFO')

    with tempfile.TemporaryDirectory(prefix=f'{myname}_') as tempdir:
        pdf_data = run_qpdf(args.input_file, tempdir)
        # Second element of the "qpdf" array: holds the PDF's objects under
        # "obj:..." keys; entries with other keys are ignored below.
        objects = pdf_data['qpdf'][1]

        # First, we build the stream-key-to-filename map.
        stream_to_file = {}
        for obj_id, obj in objects.items():
            if not obj_id.startswith('obj:'):
                continue

            # An object's value may also be an array, a number, a string,
            # ... -- only dictionaries can be file specifications.
            spec = obj.get('value')
            if not isinstance(spec, dict):
                continue
            if spec.get('/Type') != '/Filespec':
                continue

            embedded = spec.get('/EF')
            if isinstance(embedded, str):
                # /EF is an indirect reference rather than an inline
                # dictionary; follow it to the referenced object.
                embedded = objects.get(f'obj:{embedded}', {}).get('value')
            if not isinstance(embedded, dict):
                continue

            filename = spec.get('/F')
            file_stream_id = embedded.get('/F')
            if filename and file_stream_id:
                stream_to_file[f'obj:{file_stream_id}'] = filename

        # Now we can pick out the actual files.
        for sid, filename in stream_to_file.items():
            stream = objects.get(sid, {}).get('stream', {})
            stream_datafile = stream.get('datafile')
            if not stream_datafile:
                logging.warning(
                    'File stream %s (for file %s) not found!',
                    sid, filename
                )
                # Without a data file there is nothing to move.
                continue

            # We only support QPDF's "u:"-prefixed strings at the moment,
            # i.e. Unicode encoding.
            if not (isinstance(filename, str) and filename.startswith('u:')):
                logging.warning(
                    'Cannot decode filename %r at the moment, skipping file.',
                    filename
                )
                continue

            # Keep only the last path component so that an embedded name
            # cannot point outside of the output directory.
            basename = pathlib.PurePosixPath(filename[2:]).name
            if basename in ('', '.', '..'):
                # shutil.move() would treat the destination as a directory
                # and drop the file there under its temporary name.
                logging.warning(
                    'Filename %r has no usable basename, skipping file.',
                    filename
                )
                continue

            logging.info('Got file: %s', basename)
            os.makedirs(args.output_dir, exist_ok=True)
            shutil.move(stream_datafile, f'{args.output_dir}/{basename}')
if __name__ == '__main__':
    # Run only when executed as a script; the value returned by main()
    # becomes the process exit status.
    raise SystemExit(main())
# vim: tw=79
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment