Last active
September 4, 2025 22:09
-
-
Save Tblue/2cb078b2499ff7ffc7fed864b1f009d7 to your computer and use it in GitHub Desktop.
Extract embedded files from PDF files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| ############################################################################### | |
| # | |
| # Extract embedded files from PDF files. | |
| # | |
| ############################################################################### | |
| # | |
| # I recently got a strange PDF file from an insurance company. It looked like | |
| # an email, complete with attached PDF files... That I couldn't open without | |
| # installing the rather subpar Adobe Acrobat Reader (which only runs on Windows | |
| # and macOS, of course, but not on Linux). | |
| # | |
| # It turns out that these "attachments" are not actually PDF attachments as | |
| # such, but *embedded* PDF files. And my PDF reader of choice (MuPDF-based) | |
| # cannot handle those embedded files (it *can* apparently handle "real" PDF | |
| # attachments, though...). | |
| # | |
| # So here is a Python script that uses QPDF [1] to extract those embedded files | |
| # into separate files. | |
| # | |
| # Tested with Python 3.13 on Linux. | |
| # | |
| # Caveat: Doesn't handle encrypted PDF files. You'll need to decrypt them | |
| # before passing them to this script, e.g. by running: | |
| # | |
| # qpdf --decrypt --remove-restrictions --replace-input --password='PASS' FILE.pdf | |
| # | |
| # Enjoy. | |
| # | |
| # | |
| # [1] https://qpdf.sourceforge.io/ | |
| # | |
| ############################################################################### | |
| # | |
| # Copyright 2025 Tilman BLUMENBACH | |
| # | |
| # Redistribution and use in source and binary forms, with or without | |
| # modification, are permitted provided that the following conditions are met: | |
| # | |
| # 1. Redistributions of source code must retain the above copyright notice, | |
| # this list of conditions and the following disclaimer. | |
| # | |
| # 2. Redistributions in binary form must reproduce the above copyright notice, | |
| # this list of conditions and the following disclaimer in the documentation | |
| # and/or other materials provided with the distribution. | |
| # | |
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” | |
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | |
| # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| # POSSIBILITY OF SUCH DAMAGE. | |
| # | |
| ############################################################################### | |
| import argparse | |
| import json | |
| import logging | |
| import os | |
| import pathlib | |
| import shutil | |
| import subprocess | |
| import sys | |
| import tempfile | |
| from pprint import pprint | |
def get_argparser():
    """Build the command-line parser for this script.

    The parser takes two positional arguments: ``input_file`` (the PDF to
    read) and ``output_dir`` (where extracted files are written).

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    epilog_text = (
        'Embedded files are different from attachments, and this tool '
        'is about the former. The latter can be operated on using '
        'tools like qpdf(1).'
    )
    # (argument name, help text), in the order they appear on the command
    # line.
    positionals = (
        ('input_file', 'PDF input file.'),
        ('output_dir', 'Output directory, will be created if missing.'),
    )

    parser = argparse.ArgumentParser(
        description='Extract embedded files from PDF files.',
        epilog=epilog_text,
    )
    for arg_name, help_text in positionals:
        parser.add_argument(arg_name, help=help_text)

    return parser
def run_qpdf(input_file, temp_dir):
    """Run ``qpdf`` on *input_file* and return its parsed JSON output.

    ``qpdf`` is asked to write its JSON (version 2) description to
    ``data.json`` inside *temp_dir*, with stream data written to separate
    files whose names start with ``<temp_dir>/stream``.

    Args:
        input_file: Path of the PDF file to process.
        temp_dir: Directory that receives the JSON file and stream files.

    Returns:
        The decoded content of the JSON file written by ``qpdf``.

    Raises:
        subprocess.CalledProcessError: If ``qpdf`` exits with a non-zero
            status.
    """
    json_path = f'{temp_dir}/data.json'
    stream_prefix = f'{temp_dir}/stream'

    command = ['qpdf']
    command += [
        '--warning-exit-0',
        '--progress',
        '--json=2',
        '--json-stream-data=file',
        f'--json-stream-prefix={stream_prefix}',
    ]
    # "--" ends option parsing, so an input path starting with "-" is not
    # mistaken for an option.
    command += ['--', input_file, json_path]

    subprocess.run(command, check=True)

    with open(json_path, 'r', encoding='utf-8') as json_fh:
        parsed = json.load(json_fh)
    return parsed
def _map_streams_to_filenames(objects):
    """Map embedded-file stream keys to the file names that refer to them.

    Scans *objects* (the object table read from QPDF's JSON output, keyed by
    ``obj:...`` strings) for ``/Filespec`` dictionaries that carry both a
    file name (``/F``) and an embedded file stream reference (``/EF`` ->
    ``/F``).

    Args:
        objects: Mapping of object keys to object descriptions.

    Returns:
        A dict mapping ``obj:<stream reference>`` keys to the raw (still
        QPDF-encoded) file name.
    """
    stream_to_file = {}

    for obj_id, obj in objects.items():
        if not obj_id.startswith('obj:') or not isinstance(obj, dict):
            continue

        # Indirect objects can also be arrays, numbers or strings; only a
        # dictionary can be a file specification.
        spec = obj.get('value')
        if not isinstance(spec, dict) or spec.get('/Type') != '/Filespec':
            continue

        embedded = spec.get('/EF')
        if isinstance(embedded, str):
            # /EF given as an indirect reference ("N G R" string in QPDF's
            # JSON) rather than an inline dictionary: look up the target.
            target = objects.get(f'obj:{embedded}')
            embedded = target.get('value') if isinstance(target, dict) else None
        if not isinstance(embedded, dict):
            continue

        filename = spec.get('/F')
        file_stream_id = embedded.get('/F')
        if filename and file_stream_id and isinstance(file_stream_id, str):
            stream_to_file[f'obj:{file_stream_id}'] = filename

    return stream_to_file


def main():
    """Extract all embedded files of the input PDF into the output directory.

    Parses the command line, runs QPDF on the input file inside a temporary
    directory, locates the embedded file streams and moves their data files
    into the output directory under their embedded file names. Files whose
    stream data is missing, or whose name cannot be decoded or is not a
    usable file name, are skipped with a warning.

    Returns:
        None, which ``sys.exit()`` treats as a successful exit status.

    Raises:
        subprocess.CalledProcessError: Propagated from ``run_qpdf`` when
            QPDF fails.
    """
    myname = os.path.basename(sys.argv[0])
    args = get_argparser().parse_args()

    logging.basicConfig(level='INFO')

    with tempfile.TemporaryDirectory(prefix=f'{myname}_') as tempdir:
        pdf_data = run_qpdf(args.input_file, tempdir)

        # First, we build the streamID-to-filename map:
        objects = pdf_data['qpdf'][1]
        stream_to_file = _map_streams_to_filenames(objects)

        # Now we can pick out the actual files. This has to happen while the
        # temporary directory (which holds the stream data files) exists.
        for sid, filename in stream_to_file.items():
            stream_obj = objects.get(sid)
            stream = stream_obj.get('stream') if isinstance(stream_obj, dict) else None
            stream_datafile = stream.get('datafile') if isinstance(stream, dict) else None
            if not stream_datafile:
                logging.warning(
                    'File stream %s (for file %s) not found!',
                    sid, filename
                )
                # Without a data file there is nothing to move.
                continue

            # We only support QPDF's "u:"-prefixed strings at the moment, i.e.
            # Unicode encoding.
            if not isinstance(filename, str) or not filename.startswith('u:'):
                logging.warning(
                    'Cannot decode filename %r at the moment, skipping file.',
                    filename
                )
                continue

            # Keep only the last path component so that an embedded name
            # cannot place the file outside of the output directory.
            basename = pathlib.PurePosixPath(filename[2:]).name
            if basename in ('', '.', '..'):
                # These would make the destination a directory instead of a
                # file inside the output directory.
                logging.warning(
                    'Filename %r is not a usable file name, skipping file.',
                    filename
                )
                continue

            logging.info('Got file: %s', basename)
            os.makedirs(args.output_dir, exist_ok=True)
            shutil.move(stream_datafile, f'{args.output_dir}/{basename}')
# Script entry point: run main() and hand its return value to the
# interpreter as the process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
| # vim: tw=79 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment