Tblue · September 4, 2025 22:09
diff --git a/extract-pdf-embeds.py b/extract-pdf-embeds.py
 #!/usr/bin/env python3
 ###############################################################################
 #
 # Extract embedded files from PDF files.
 #
 ###############################################################################
 #
 # I recently got a strange PDF file from an insurance company. It looked like
 # an email, complete with attached PDF files... which I couldn't open without
 # installing the rather subpar Adobe Acrobat Reader (which only runs on Windows
 # and macOS, of course, but not on Linux).
 #
 # It turns out that these "attachments" are not actually PDF attachments as
 # such, but *embedded* PDF files. And my PDF reader of choice (MuPDF-based)
 # cannot handle those embedded files (it *can* apparently handle "real" PDF
 # attachments, though...).
 #
 # So here is a Python script that uses QPDF [1] to extract those embedded files
 # into separate files.
 #
 # Tested with Python 3.13 on Linux.
 #
 # Caveat: Doesn't handle encrypted PDF files. You'll need to decrypt them
 # before passing them to this script, e.g. by running:
 #
 #   qpdf --decrypt --remove-restrictions --replace-input --password='PASS' FILE.pdf
 #
 # Enjoy.
 #
 #
 # [1] https://qpdf.sourceforge.io/
 #
 ###############################################################################
 #
 # Copyright 2025 Tilman BLUMENBACH
 # 
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 # 
 # 1. Redistributions of source code must retain the above copyright notice,
 #    this list of conditions and the following disclaimer.
 # 
 # 2. Redistributions in binary form must reproduce the above copyright notice,
 #    this list of conditions and the following disclaimer in the documentation
 #    and/or other materials provided with the distribution.
 # 
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 #
 ###############################################################################

 import argparse
 import json
 import logging
 import os
 import pathlib
 import shutil
 import subprocess
 import sys
 import tempfile
 from pprint import pprint


 def get_argparser():
    """Build the command-line parser for this script.

    The returned ``argparse.ArgumentParser`` takes two required positional
    arguments: ``input_file`` and ``output_dir``.
    """
    epilog_text = (
        'Embedded files are different from attachments, and this tool '
        'is about the former. The latter can be operated on using '
        'tools like qpdf(1).'
    )
    positionals = (
        ('input_file', 'PDF input file.'),
        ('output_dir', 'Output directory, will be created if missing.'),
    )

    parser = argparse.ArgumentParser(
        description='Extract embedded files from PDF files.',
        epilog=epilog_text,
    )
    for dest, help_text in positionals:
        parser.add_argument(dest, help=help_text)

    return parser


 def run_qpdf(input_file, temp_dir):
    """Dump *input_file* as QPDF JSON (version 2) and return the parsed data.

    qpdf is asked to write the JSON document to ``data.json`` inside
    *temp_dir* and to put stream data into separate files whose paths start
    with ``<temp_dir>/stream``.

    Raises ``subprocess.CalledProcessError`` when qpdf exits with a non-zero
    status (``check=True``).
    """
    json_path = f'{temp_dir}/data.json'
    stream_prefix = f'{temp_dir}/stream'

    command = ['qpdf', '--warning-exit-0', '--progress', '--json=2']
    command.append('--json-stream-data=file')
    command.append(f'--json-stream-prefix={stream_prefix}')
    # "--" ends option parsing, so the paths are never taken for options.
    command.extend(['--', input_file, json_path])

    subprocess.run(command, check=True)

    with open(json_path, 'r', encoding='utf-8') as json_fh:
        parsed = json.load(json_fh)
    return parsed


 def _map_streams_to_filenames(objects):
    """Map embedded-file stream keys to the raw filename of their Filespec.

    *objects* is the object table of a QPDF JSON (version 2) dump. Every
    ``/Filespec`` dictionary that has both an ``/F`` filename and an
    ``/EF`` ``/F`` stream reference yields one
    ``'obj:<reference>': filename`` entry in the returned dict.
    """
    stream_to_file = {}
    for object_id, entry in objects.items():
        if not object_id.startswith('obj:'):
            continue

        # "value" may hold any PDF object (number, array, ...), not only a
        # dictionary, so it must be type-checked before calling .get().
        info = entry.get('value')
        if not isinstance(info, dict) or info.get('/Type') != '/Filespec':
            continue

        embedded = info.get('/EF')
        if isinstance(embedded, str):
            # /EF given as an indirect reference ("N G R"): look it up.
            embedded = objects.get(f'obj:{embedded}', {}).get('value')
        if not isinstance(embedded, dict):
            continue

        filename = info.get('/F')
        file_stream_id = embedded.get('/F')
        if (filename and isinstance(filename, str) and
                file_stream_id and isinstance(file_stream_id, str)):
            stream_to_file[f'obj:{file_stream_id}'] = filename

    return stream_to_file


 def main():
    """Extract all embedded files of the input PDF into the output directory.

    Parses the command line, lets QPDF dump the PDF into a temporary
    directory and moves the data file of every embedded-file stream to
    ``<output_dir>/<base name of the embedded file>``. Files whose stream
    data or filename cannot be used are skipped with a warning.

    Returns ``None``, so ``sys.exit(main())`` exits with status 0.
    """
    myname = os.path.basename(sys.argv[0])

    args = get_argparser().parse_args()
    logging.basicConfig(level='INFO')

    with tempfile.TemporaryDirectory(prefix=f'{myname}_') as tempdir:
        pdf_data = run_qpdf(args.input_file, tempdir)

        # First, we build the streamID-to-filename map:
        streams = pdf_data['qpdf'][1]
        stream_to_file = _map_streams_to_filenames(streams)

        # Now we can pick out the actual files.
        for sid, raw_name in stream_to_file.items():
            stream = streams.get(sid, {}).get('stream', {})
            stream_datafile = stream.get('datafile')

            if not stream_datafile:
                logging.warning(
                    'File stream %s (for file %s) not found!',
                    sid, raw_name
                )
                # Without a data file there is nothing to move.
                continue

            # We only support QPDF's "u:"-prefixed strings at the moment, i.e.
            # Unicode encoding.
            if not raw_name.startswith('u:'):
                logging.warning(
                    'Cannot decode filename %r at the moment, skipping file.',
                    raw_name
                )
                continue

            # The name comes from the (untrusted) PDF: keep only its last
            # path component and refuse names that would not denote a file
            # inside the output directory.
            filename = pathlib.PurePosixPath(raw_name[2:]).name
            if filename in ('', '.', '..'):
                logging.warning(
                    'Filename %r has no usable base name, skipping file.',
                    raw_name
                )
                continue
            logging.info('Got file: %s', filename)

            # Created lazily so that no directory appears when nothing is
            # extracted.
            os.makedirs(args.output_dir, exist_ok=True)
            shutil.move(stream_datafile, f'{args.output_dir}/{filename}')


 # Script entry point: whatever main() returns becomes the exit status.
 if __name__ == '__main__':
    raise SystemExit(main())

 # vim: tw=79
	#!/usr/bin/env python3
	###############################################################################
	#
	# Extract embedded files from PDF files.
	#
	###############################################################################
	#
	# I recently got a strange PDF file from an insurance company. It looked like
	# an email, complete with attached PDF files... which I couldn't open without
	# installing the rather subpar Adobe Acrobat Reader (which only runs on Windows
	# and macOS, of course, but not on Linux).
	#
	# It turns out that these "attachments" are not actually PDF attachments as
	# such, but embedded PDF files. And my PDF reader of choice (MuPDF-based)
	# cannot handle those embedded files (it can apparently handle "real" PDF
	# attachments, though...).
	#
	# So here is a Python script that uses QPDF [1] to extract those embedded files
	# into separate files.
	#
	# Tested with Python 3.13 on Linux.
	#
	# Caveat: Doesn't handle encrypted PDF files. You'll need to decrypt them
	# before passing them to this script, e.g. by running:
	#
	# qpdf --decrypt --remove-restrictions --replace-input --password='PASS' FILE.pdf
	#
	# Enjoy.
	#
	#
	# [1] https://qpdf.sourceforge.io/
	#
	###############################################################################
	#
	# Copyright 2025 Tilman BLUMENBACH
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are met:
	#
	# 1. Redistributions of source code must retain the above copyright notice,
	# this list of conditions and the following disclaimer.
	#
	# 2. Redistributions in binary form must reproduce the above copyright notice,
	# this list of conditions and the following disclaimer in the documentation
	# and/or other materials provided with the distribution.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS”
	# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
	# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	# POSSIBILITY OF SUCH DAMAGE.
	#
	###############################################################################

	import argparse
	import json
	import logging
	import os
	import pathlib
	import shutil
	import subprocess
	import sys
	import tempfile
	from pprint import pprint


	def get_argparser():
	    """Build the command-line parser for this script.

	    The returned ``argparse.ArgumentParser`` takes two required
	    positional arguments: ``input_file`` and ``output_dir``.
	    """
	    p = argparse.ArgumentParser(
	        description='Extract embedded files from PDF files.',
	        epilog='Embedded files are different from attachments, and this tool '
	               'is about the former. The latter can be operated on using '
	               'tools like qpdf(1).'
	    )

	    p.add_argument(
	        'input_file',
	        help='PDF input file.'
	    )
	    p.add_argument(
	        'output_dir',
	        help='Output directory, will be created if missing.'
	    )

	    return p


	def run_qpdf(input_file, temp_dir):
	    """Dump *input_file* as QPDF JSON (version 2) and return the parsed data.

	    qpdf is asked to write the JSON document to ``data.json`` inside
	    *temp_dir* and to put stream data into separate files whose paths
	    start with ``<temp_dir>/stream``.

	    Raises ``subprocess.CalledProcessError`` when qpdf exits with a
	    non-zero status (``check=True``).
	    """
	    out_file = f'{temp_dir}/data.json'

	    subprocess.run(
	        [
	            'qpdf',
	            '--warning-exit-0',
	            '--progress',
	            '--json=2',
	            '--json-stream-data=file',
	            f'--json-stream-prefix={temp_dir}/stream',
	            # "--" ends option parsing for the two paths that follow.
	            '--', input_file, out_file
	        ],
	        check=True
	    )

	    with open(out_file, 'r', encoding='utf-8') as fh:
	        return json.load(fh)


	def _map_streams_to_filenames(objects):
	    """Map embedded-file stream keys to the raw filename of their Filespec.

	    *objects* is the object table of a QPDF JSON (version 2) dump. Every
	    ``/Filespec`` dictionary that has both an ``/F`` filename and an
	    ``/EF`` ``/F`` stream reference yields one
	    ``'obj:<reference>': filename`` entry in the returned dict.
	    """
	    stream_to_file = {}
	    for object_id, entry in objects.items():
	        if not object_id.startswith('obj:'):
	            continue

	        # "value" may hold any PDF object (number, array, ...), not only
	        # a dictionary, so it must be type-checked before calling .get().
	        info = entry.get('value')
	        if not isinstance(info, dict) or info.get('/Type') != '/Filespec':
	            continue

	        embedded = info.get('/EF')
	        if isinstance(embedded, str):
	            # /EF given as an indirect reference ("N G R"): look it up.
	            embedded = objects.get(f'obj:{embedded}', {}).get('value')
	        if not isinstance(embedded, dict):
	            continue

	        filename = info.get('/F')
	        file_stream_id = embedded.get('/F')
	        if (filename and isinstance(filename, str) and
	                file_stream_id and isinstance(file_stream_id, str)):
	            stream_to_file[f'obj:{file_stream_id}'] = filename

	    return stream_to_file


	def main():
	    """Extract all embedded files of the input PDF into the output directory.

	    Parses the command line, lets QPDF dump the PDF into a temporary
	    directory and moves the data file of every embedded-file stream to
	    ``<output_dir>/<base name of the embedded file>``. Files whose stream
	    data or filename cannot be used are skipped with a warning.

	    Returns ``None``, so ``sys.exit(main())`` exits with status 0.
	    """
	    myname = os.path.basename(sys.argv[0])

	    args = get_argparser().parse_args()
	    logging.basicConfig(level='INFO')

	    with tempfile.TemporaryDirectory(prefix=f'{myname}_') as tempdir:
	        pdf_data = run_qpdf(args.input_file, tempdir)

	        # First, we build the streamID-to-filename map:
	        streams = pdf_data['qpdf'][1]
	        stream_to_file = _map_streams_to_filenames(streams)

	        # Now we can pick out the actual files.
	        for sid, raw_name in stream_to_file.items():
	            stream = streams.get(sid, {}).get('stream', {})
	            stream_datafile = stream.get('datafile')

	            if not stream_datafile:
	                logging.warning(
	                    'File stream %s (for file %s) not found!',
	                    sid, raw_name
	                )
	                # Without a data file there is nothing to move.
	                continue

	            # We only support QPDF's "u:"-prefixed strings at the moment,
	            # i.e. Unicode encoding.
	            if not raw_name.startswith('u:'):
	                logging.warning(
	                    'Cannot decode filename %r at the moment, skipping file.',
	                    raw_name
	                )
	                continue

	            # The name comes from the (untrusted) PDF: keep only its last
	            # path component and refuse names that would not denote a
	            # file inside the output directory.
	            filename = pathlib.PurePosixPath(raw_name[2:]).name
	            if filename in ('', '.', '..'):
	                logging.warning(
	                    'Filename %r has no usable base name, skipping file.',
	                    raw_name
	                )
	                continue
	            logging.info('Got file: %s', filename)

	            # Created lazily so that no directory appears when nothing
	            # is extracted.
	            os.makedirs(args.output_dir, exist_ok=True)
	            shutil.move(stream_datafile, f'{args.output_dir}/{filename}')


	# Script entry point: whatever main() returns becomes the exit status.
	if __name__ == '__main__':
	    sys.exit(main())

	# vim: tw=79
No results found