Skip to content

Instantly share code, notes, and snippets.

@Jaakkonen
Last active March 30, 2024 23:35
Show Gist options
  • Select an option

  • Save Jaakkonen/cadc8ff66be362d0731846df5db610a4 to your computer and use it in GitHub Desktop.

Select an option

Save Jaakkonen/cadc8ff66be362d0731846df5db610a4 to your computer and use it in GitHub Desktop.
Nordea Finnish machine format (html) bank statement parser - Nordea konekielisen tiliotteen (HTML) lukija Python
"""
Parser for Finnish Nordea html format bank statements.
File names are commonly in the name format:
Konekielinen-tiliote(YY-MM-DD).html
Yields categorized rows with deposited/withdrawn sums, reference numbers, messages, dates, etc. extracted with regexes.
Access the rows with the rows dictionary using the RowType enum as the key, and get the properties of the row with the groupdict() method of the match object.
Example:
>>> rows[RowType.Deposit][0].match.groupdict()
{'arkistotunnus': 'GSIPC000000000000',
'maksup': '01.02.',
'arvop': '01.02.',
'counterparty': 'MEIKÄLÄINEN MATTI',
'maksutyyppi': '710 Pano',
'viesti': 'Maksun viesti\nNOTPROVIDED\nOPF00000000.0',
'viite': None,
'viitetarkenne': None,
'tapahtumanro': '4',
'summa': '69,42+'}
"""
from typing import Any, Generator, NamedTuple
from bs4 import BeautifulSoup, ResultSet
from enum import Enum
from collections import defaultdict
import re
from decimal import Decimal
from datetime import datetime
with open("Konekielinen-tiliote(24-03-12).html") as f:
soup = BeautifulSoup(f, "html.parser")
def to_decimal(s: str) -> Decimal:
# If there's a - or + sign at the end, move it to the front
if s[-1] in "+-":
s = s[-1] + s[:-1]
# Remove thousands separator and replace comma with dot
s = s.replace(".", "").replace(",", ".")
return Decimal(s)
# Cells are separated by 📁 emoji
CELL_SEPARATOR = BOX = "📁"
# Text between cells is separated by ↩ emoji
TAG_SEPARATOR = CARD_INDEX = "↩"
class RowType(Enum):
NordeaHeader = r"^↩Nordea↩\n↩1544 NBC FI Customers📁TILIOTE↩Päivämäärä↩(?P<raporttipvm>[\d\.]{10})📁Sivu\s+(?P<page>\d+)↩Kausi↩(?P<rangestart>[\d\.]{10}) -\n\s+(?P<rangeend>[\d\.]{10})↩(?P<moyear>\d{3} / \d{4})$"
AccountHeader1 = r"^↩(?P<accountname>[A-Z\s-]+)📁(?P<accounttype>\w+)↩(?P<accountnumber>[\d-]+)📁Valuutta↩EUR$"
AccountHeader2 = r"^📁IBAN↩(?P<iban>FI[\d ]{20})📁SWIFT/BIC↩NDEAFIHH↩$"
TableHeader = r"^Arkistointitunnus↩Saajan tilinumero📁Maksup↩Arvop📁Saaja / Maksaja↩Viesti📁Tap.↩nro📁Määrä$"
BalanceRow = r"^📁📁(?P<pvm>[\d\.]{10}) Saldo📁📁(?P<saldo>[\d\.,\+-]+)$"
RegisteredDate = r"^Kirjauspäivä (?P<pvm>[\d\.]{6})📁📁📁📁$"
# These are breakdown rows for Nordea banking fees. The main row has the archiving number and the total sum.
# To avoid double counting, these should not be included in the final sums.
# Note that the only difference is that the "arvop" group is missing. Additionally the value has a class in the <td> tag in the HTML but it's not visible in the text.
NordeaPaymentBreakdown = r"J ↩(?P<arkistotunnus>[A-Z\d]+)↩📁(?P<maksup>[\d\.]+)📁/J↩(?P<counterparty>[^↩]+)↩(?P<viesti>[^↩]+?)\s*↩(?P<aika>[^↩]+)↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}(-|\+))"
Withdrawal = r"(A |JE|J )?↩(?P<arkistotunnus>[A-Z\d]+)↩(\s(?P<viite>[\d-]+))?📁(?P<maksup>[\d\.]+)↩(?P<arvop>[\d\.]+)📁/(J|A)↩(?P<counterparty>[^↩]+)↩(?P<maksutyyppi>\d+ [\w-]+)\s+[^↩]+↩(?P<viesti>[^↩]+?)\s*↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}-)"
Deposit = r"(A )?↩(?P<arkistotunnus>[A-Z\d]+)↩📁(?P<maksup>[\d\.]+)↩(?P<arvop>[\d\.]+)📁/J↩(?P<counterparty>[^↩]+)↩(?P<maksutyyppi>\d+ [\w-]+)\s*↩((?P<viesti>[^↩]+?)\s*|Viite (?P<viite>[\d ]+?)\s+↩(?P<viitetarkenne>\w+\n\w+)\s+)↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}\+)"
SaldoRow = r"📁📁(?P<pvm>[\d\.]{10}) Saldo↩Käyttövara📁📁(?P<saldo>[\d\.,\+]+)↩(?P<kayttovara>[\d\.,\+]+)"
PeriodWithdrawalDeposit = r"^📁📁(?P<panot>\d+)↩(?P<otot>\d+)↩Panot(?P<period_deposit>(/kausi|/kk|/vv| yhteensä))↩Otot(?P<period_withdrawal>(/kausi|/kk|/vv| yhteensä))📁📁(?P<panot_summa>[\d\.,\+]+)↩(?P<otot_summa>([\d\.,-]+|0,00\+))$"
NewLine = r"^↩\xa0↩📁📁📁📁$"
regexes = {rowtype: re.compile(rowtype.value) for rowtype in RowType}
class Row(NamedTuple):
rowtype: RowType
text: str
cells: ResultSet
match: re.Match[str]
def extractrows(soup) -> Generator[Row, Any, None]:
"""
Yields rows from the table containing payment details as a list of cells.
"""
t = soup.findAll("table")
for table in t:
rows = table.findAll("tr")
for row in rows:
cells = row.findAll(["td", "th"])
if cells:
astext = "📁".join(cell.get_text("↩").strip() for cell in cells)
for rowtype in RowType:
if match := re.match(regexes[rowtype], astext):
yield Row(rowtype, astext, cells, match)
break
else:
raise ValueError(f"Unknown row: {astext}")
rows: dict[RowType, list[Row]] = defaultdict(list)
for row in extractrows(soup):
rows[row.rowtype].append(row)
for rowtype, rowdata in rows.items():
print("#################################")
print(rowtype)
print("#################################")
for row in rowdata:
print(row.match.groupdict())
print()
# def sanity_check():
# """
# Validates the data in the rows
# """
# # Validate the PeriodWithdrawalDeposit rows
# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]:
# data = match.groupdict()
# assert (
# data["period_deposit"] == data["period_withdrawal"]
# ), "Period deposit and withdrawal should match"
# if data["period_deposit"] in {"/kk", "/vv"}:
# # The withdrawal and deposit counts are not counted for monthly and yearly periods
# # (Those are only counted for each "period" )
# assert data["panot"] == data["otot"] == "0"
# vv_total = next(
# (
# match.groupdict()
# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
# if match.groupdict()["period_deposit"] == "/vv"
# ),
# None,
# )
# assert vv_total is not None, "No yearly total found"
# saldo_dates = {
# datetime.strptime(match.groupdict()["pvm"], "%d.%m.%Y"): to_decimal(
# match.groupdict()["saldo"]
# )
# for rowtype, text, cells, match in [
# *rows[RowType.BalanceRow],
# *rows[RowType.SaldoRow],
# ]
# }
# START_SALDO_DATE = min(saldo_dates.keys())
# START_SALDO = saldo_dates[START_SALDO_DATE]
# END_SALDO_DATE = max(saldo_dates.keys())
# END_SALDO = saldo_dates[END_SALDO_DATE]
# # Sum up the total deposits and withdrawals
# total_deposits = sum(
# to_decimal(match.groupdict()["summa"])
# for rowtype, text, cells, match in rows[RowType.Deposit]
# )
# total_withdrawals = sum(
# to_decimal(match.groupdict()["summa"])
# for rowtype, text, cells, match in rows[RowType.Withdrawal]
# )
# # Sum up "panot yhteensä" and "otot yhteensä" for the yearly period
# sum_panot = sum(
# to_decimal(match.groupdict()["panot_summa"])
# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
# if match.groupdict()["period_deposit"] == " yhteensä"
# )
# sum_otot = sum(
# to_decimal(match.groupdict()["otot_summa"])
# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
# if match.groupdict()["period_withdrawal"] == " yhteensä"
# )
# sum_panot_krt = sum(
# to_decimal(match.groupdict()["panot"])
# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
# if match.groupdict()["period_deposit"] == " yhteensä"
# )
# sum_otot_krt = sum(
# to_decimal(match.groupdict()["otot"])
# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
# if match.groupdict()["period_withdrawal"] == " yhteensä"
# )
# withdrawals_by_month = defaultdict(list)
# for row in rows[RowType.Withdrawal]:
# data = row.match.groupdict()
# withdrawals_by_month[datetime.strptime(data["maksup"], "%d.%m.").month].append(
# row
# )
# assert (
# sum(len(v) for v in withdrawals_by_month.values())
# == len(rows[RowType.Withdrawal])
# == sum_otot_krt
# )
# assert len(rows[RowType.Deposit]) == sum_panot_krt
# assert sum_panot == to_decimal(vv_total["panot_summa"]) == total_deposits
# assert sum_otot == to_decimal(vv_total["otot_summa"]) == total_withdrawals
# assert START_SALDO + total_deposits + total_withdrawals == END_SALDO
# sanity_check()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment