Skip to content

Instantly share code, notes, and snippets.

@brunoamaral
Last active October 31, 2025 20:03
Show Gist options
  • Select an option

  • Save brunoamaral/258e73a488a75c8fb60a16b99119933b to your computer and use it in GitHub Desktop.

Select an option

Save brunoamaral/258e73a488a75c8fb60a16b99119933b to your computer and use it in GitHub Desktop.
paperless-ngx, post consume script to fill in a custom field with the total for an invoice
#!/usr/bin/env python3
import os
import requests
import sys
import re
# Connection settings for the Paperless-ngx REST API, taken from the environment.
# The base URL falls back to a local instance; the credential has no fallback.
PAPERLESS_API_URL = os.environ.get("PAPERLESS_API_URL", "http://localhost:8000/api")
PAPERLESS_API_TOKEN = os.environ.get("SCRIPT_PAPERLESS_API_TOKEN")
# Sent with every API request made by this script.
# NOTE(review): the scheme is "Basic" although the variable is called a token --
# confirm the stored value is what the server expects for this scheme.
HEADERS = {"authorization": "Basic {}".format(PAPERLESS_API_TOKEN)}
def get_document(document_id, timeout=30):
    """
    Retrieve a document's JSON representation from the Paperless-ngx API.

    Args:
        document_id (int or str): The ID of the document to fetch.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout, `requests` can block indefinitely.
    Returns:
        dict: The decoded JSON body of the document endpoint, or None if the
        request failed, the server did not answer with HTTP 200, or the body
        was not valid JSON.
    """
    # Construct API endpoint for the document
    url = f"{PAPERLESS_API_URL}/documents/{document_id}/"
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: Unable to retrieve text for document {document_id}. Status code: {response.status_code}")
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection/timeout/URL problems.
        # ValueError: the response body could not be decoded as JSON.
        print(f"Error: {e}")
        return None
def _detect_currency(text):
    """Return "EUR" or "USD" if *text* contains a marker for it, else None."""
    if "€" in text or "EUR" in text:
        return "EUR"
    if "$" in text or "USD" in text:
        return "USD"
    return None


def extract_invoice_total(content, regex_patterns):
    """
    Extract the total amount and currency from the content field using a list of regular expressions.

    Patterns are tried in order and the first one that matches wins; each
    pattern must expose the amount as capture group 1.

    Args:
        content (str): The invoice text content. None or empty yields no match.
        regex_patterns (list of str): List of regular expressions to match total amounts.
    Returns:
        tuple: A tuple containing the matched total amount (str) and currency (str)
        or (None, None) if no match is found. The currency is "EUR", "USD" or
        "Unknown".
    """
    if not content:
        return None, None
    for pattern in regex_patterns:
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            # Prefer a currency marker inside the matched text itself: scanning
            # the whole document first would label a "$" total as EUR whenever
            # "EUR" (or e.g. "EUROPE") appears anywhere else on the invoice.
            # The match was case-insensitive, so compare it upper-cased.
            currency = (
                _detect_currency(match.group(0).upper())
                or _detect_currency(content)
                or "Unknown"
            )
            return match.group(1), currency
    return None, None
def format_total_value(total, currency="EUR"):
    """
    Format the total amount to comply with Paperless-ngx requirements.

    Both "1.234,56" and "1,234.56" style amounts are understood: when both
    separators occur, the one appearing last is the decimal mark and the other
    is dropped as a thousands separator. A separator that occurs more than
    once on its own ("1,234,567") is treated as grouping. A single lone
    separator is always read as the decimal mark ("12,50" -> 12.50).

    Args:
        total (str): The extracted total value.
        currency (str): The currency code to prepend (default is EUR).
    Returns:
        str: Formatted total value (e.g., EUR123.45), or None if the value
        cannot be interpreted as a number.
    """
    if not isinstance(total, str):
        print(f"Error: Unable to format total value '{total}'.")
        return None
    # Drop all whitespace, then any trailing separator picked up from sentence
    # punctuation (a capture like "12.50." would otherwise fail to parse).
    compact = "".join(total.split()).rstrip(".,")
    commas = compact.count(",")
    dots = compact.count(".")
    if commas and dots:
        decimal_sep = "," if compact.rfind(",") > compact.rfind(".") else "."
        grouping_sep = "." if decimal_sep == "," else ","
        normalized = compact.replace(grouping_sep, "").replace(decimal_sep, ".")
    elif commas > 1 or dots > 1:
        normalized = compact.replace(",", "").replace(".", "")
    else:
        normalized = compact.replace(",", ".")
    try:
        # Ensure it's a valid float and format to two decimal places
        total_float = float(normalized)
    except ValueError:
        print(f"Error: Unable to format total value '{total}'.")
        return None
    return f"{currency}{total_float:.2f}"
def update_document_custom_field(document_id, value, field_id=1, timeout=30):
    """
    Update or add a document's custom field using the Paperless-ngx API.

    The document is fetched first so that its other custom fields are sent
    back unchanged alongside the new value.

    Args:
        document_id (int): The ID of the document to update.
        value (str): The value to set for the custom field.
        field_id (int): Primary key of the custom field to set (default 1).
        timeout (float): Seconds to wait for each HTTP request before giving up.
    Returns:
        bool: True if the update was successful, False otherwise.
    """
    url = f"{PAPERLESS_API_URL}/documents/{document_id}/"
    try:
        # Get the current document details
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: Unable to fetch document {document_id} for update. Status: {response.status_code}")
            return False
        # "or []" also covers a document whose custom_fields is null.
        custom_fields = response.json().get("custom_fields") or []

        # Overwrite the field if the document already has it, otherwise add it.
        field_exists = False
        for field in custom_fields:
            if field.get("field") == field_id:
                field["value"] = value
                field_exists = True
        if not field_exists:
            custom_fields.append({"field": field_id, "value": value})

        # Send the PATCH request
        payload = {"custom_fields": custom_fields}
        response = requests.patch(url, headers=HEADERS, json=payload, timeout=timeout)
        if response.status_code == 200:
            print(f"Document {document_id} updated successfully with custom field.")
            return True
        print(f"Failed to update document {document_id}. Status: {response.status_code}, Response: {response.text}")
        return False
    except (requests.RequestException, ValueError) as e:
        # RequestException: connection/timeout/URL problems.
        # ValueError: the GET response body could not be decoded as JSON.
        print(f"Error updating document {document_id}: {e}")
        return False
# List of regex patterns for matching different invoice formats.
# They are tried in order (most specific first) and matched case-insensitively;
# capture group 1 must be the amount. The leading \b keeps "total" from
# matching inside "Subtotal", which usually appears earlier on an invoice.
patterns = [
    r"\btotal due\s+\$([\d.,]+)",  # DigitalOcean (specific)
    r"\btotal\s+\$([\d.,]+)\s*(USD)?",  # Cloudflare (optional "USD")
    r"\btotal da fatura\s+(?:EUR)?\s*([\d.,]+)",  # BOLT (optional "EUR")
    r"\btotal a pagar\s+([\d.,]+)\s*€?",  # Endesa/Continente and generic "total a pagar" (optional "€")
    r"\btotal\s+€\s*([\d.,]+)",  # VENDUS invoices
    r"\btotal\s+([\d.,]+)",  # Generic fallback
]
def main():
    """
    Main entry point for the post-consumption script.

    The document ID is taken from the first command-line argument or, when no
    argument is given, from the DOCUMENT_ID environment variable. Only
    documents whose document type is the invoice type are processed: the total
    is extracted from the document content, formatted, and written to the
    document's custom field.
    """
    # ID of the document type this script treats as an invoice.
    invoice_document_type_id = 2

    # Indexing sys.argv[1] directly raises IndexError when no argument is
    # passed, which would make the environment fallback unreachable.
    document_id = (sys.argv[1] if len(sys.argv) > 1 else None) or os.getenv("DOCUMENT_ID")
    if not document_id:
        print("Error: No document ID given (expected as first argument or in DOCUMENT_ID).")
        sys.exit(1)

    document = get_document(document_id)
    if document is None:
        print(f"Failed to retrieve text for document {document_id}.")
        return
    if document.get("document_type") != invoice_document_type_id:
        return

    total, currency = extract_invoice_total(document.get("content") or "", patterns)
    print(total)
    if total is None:
        return

    # NOTE(review): currency can be "Unknown" here; confirm the target custom
    # field accepts a value with that prefix.
    formatted_total = format_total_value(total, currency)
    if formatted_total is None:
        # Formatting already reported the problem; do not send an empty value.
        return
    update_document_custom_field(document_id, formatted_total)


if __name__ == "__main__":
    main()
@inaxsesable
Copy link

I am looking to implement this for myself. Would you mind sharing how to set it up in the latest Paperless-ngx? I've gone through the Paperless script documentation, but it doesn't go into enough detail for me to follow what is required.

@brunoamaral
Copy link
Author

I am afraid I don't have a full write-up.

The short version is that I configured this as a post-consumption script.

If you want to use this, you'll need to edit the regular expressions in the `patterns` list (around line 132 of the script) to match your needs.

My full setup has a main script (pipeline.py) that runs my own pipeline from a directory. And since I run this using a container, I need to keep a requirements.txt and run docker exec paperless pip install -r ../scripts/requirements.txt every time there is an update.

image

Does this help? And thank you for reaching out. It feels nice to know that something I made is helping someone else. 😁

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment