Last active
August 3, 2025 18:03
-
-
Save cryptoluks/55b77b1985f11cb8bd5735117127264d to your computer and use it in GitHub Desktop.
Pre-consume hook for Paperless-ngx. This script checks if a PDF is password-protected, then tries a list of passwords from a dictionary file to unlock it. When successful, it saves the PDF with a deterministic ID so duplicate detection remains accurate.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import os | |
| import sys | |
| import pikepdf | |
def log(message):
    """Print *message* as one line on standard output.

    Paperless-ngx captures the hook's stdout and stderr, so a plain print
    is all the logging this script needs.
    """
    print(message, file=sys.stdout)
def is_pdf_encrypted(file_path):
    """Report whether the PDF at *file_path* needs a password to be opened.

    The file is opened with pikepdf without supplying a password:

    * opening succeeds            -> returns False
    * ``pikepdf.PasswordError``   -> returns True
    * any other exception         -> the error is logged and False is returned
    """
    needs_password = False
    try:
        # Only the act of opening matters; the document object is not used.
        with pikepdf.open(file_path):
            pass
    except pikepdf.PasswordError:
        needs_password = True
    except Exception as exc:
        log(f"Error during encryption check: {exc}")
    return needs_password
def unlock_pdf(file_path, dictionary_path="/usr/local/bin/scripts/passwords.txt"):
    """Remove password protection from the PDF at *file_path*, in place.

    Steps:

    1. Return early (with a log line) if *file_path* does not exist.
    2. Log the ``TASK_ID`` and ``DOCUMENT_SOURCE_PATH`` environment
       variables and the working file path for debugging.
    3. Return early if ``is_pdf_encrypted`` reports the file is not
       password protected.
    4. Read candidate passwords from *dictionary_path*, one per line;
       surrounding whitespace is stripped and blank lines are ignored.
    5. Try each candidate with pikepdf. On the first one that opens the
       file, save the document back to *file_path* with
       ``deterministic_id=True`` and stop.

    Passwords are never written to the log; candidates are identified by
    their 1-based position among the non-blank dictionary entries, because
    the hook's output is captured by Paperless-ngx.

    Args:
        file_path: Path of the PDF to unlock. The file is overwritten on
            success.
        dictionary_path: Path of the UTF-8 text file holding one candidate
            password per line. Defaults to the location the script has
            always used.

    Returns:
        None. All outcomes are reported through ``log``.
    """
    # Ensure the file exists
    if not os.path.exists(file_path):
        log(f"Error: File not found: {file_path}")
        return

    # Log available environment variables for debugging
    task_id = os.environ.get("TASK_ID", "N/A")
    source_path = os.environ.get("DOCUMENT_SOURCE_PATH", "Unknown")
    log(f"Processing TASK_ID: {task_id}")
    log(f"Source file: {source_path}")
    log(f"Working file: {file_path}")

    # Only process if the PDF is encrypted
    if not is_pdf_encrypted(file_path):
        log("PDF is not password protected. Skipping password removal.")
        return
    log("PDF is encrypted. Attempting password removal...")

    # Read the password dictionary. Open it directly instead of checking
    # for existence first, and pin the encoding so non-ASCII passwords do
    # not depend on the container's locale.
    try:
        with open(dictionary_path, "r", encoding="utf-8") as f:
            stripped_lines = (line.strip() for line in f)
            passwords = [entry for entry in stripped_lines if entry]
    except FileNotFoundError:
        log("Error: Password dictionary file not found!")
        return
    except (OSError, UnicodeDecodeError) as e:
        log(f"Error: Could not read password dictionary: {e}")
        return

    if not passwords:
        log("Error: Password dictionary is empty. Nothing to try.")
        return

    # Iterate through the passwords and attempt to unlock the PDF
    for entry_number, pwd in enumerate(passwords, start=1):
        try:
            # allow_overwriting_input lets save() write back to the path
            # the document was opened from.
            with pikepdf.open(
                file_path, password=pwd, allow_overwriting_input=True
            ) as pdf:
                log(
                    f"Success: Password entry #{entry_number} worked. "
                    "Removing password..."
                )
                pdf.save(file_path, deterministic_id=True)
                log("Password removal successful.")
                return
        except pikepdf.PasswordError:
            log(f"Password entry #{entry_number} did not work.")
        except Exception as e:
            # Anything other than a wrong password is unexpected; stop
            # rather than keep retrying against the file.
            log(f"An unexpected error occurred: {e}")
            return

    log("None of the provided passwords worked. PDF remains encrypted.")
if __name__ == "__main__":
    # A pre-consume script may only modify the file named by
    # DOCUMENT_WORKING_PATH, so that is the only path this hook touches.
    file_path = os.environ.get("DOCUMENT_WORKING_PATH")
    if file_path:
        unlock_pdf(file_path)
    else:
        # Missing or empty variable: report it and exit with a failure code.
        log("Error: DOCUMENT_WORKING_PATH environment variable is not set.")
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Just a suggestion: is it possible to apply a particular tag (say, "Encrypted") if the script detects a password-protected file? That would make it possible to filter those files at a later stage.