Jaakkonen · March 30, 2024 23:35
diff --git a/nordea_statement_parser.py b/nordea_statement_parser.py
 """
 Parser for Finnish Nordea html format bank statements.

 File names are commonly in the name format:
  Konekielinen-tiliote(YY-MM-DD).html

 Yields categorized rows with deposited/withdrawn sums, reference numbers, messages, dates, etc. extracted with regexes.
 Access the rows with the rows dictionary using the RowType enum as the key, and get the properties of the row with the groupdict() method of the match object.

 Example:
  >>> rows[RowType.Deposit][0].match.groupdict()
    {'arkistotunnus': 'GSIPC000000000000',
    'maksup': '01.02.',
    'arvop': '01.02.',
    'counterparty': 'MEIKÄLÄINEN MATTI',
    'maksutyyppi': '710 Pano',
    'viesti': 'Maksun viesti\nNOTPROVIDED\nOPF00000000.0',
    'viite': None,
    'viitetarkenne': None,
    'tapahtumanro': '4',
    'summa': '69,42+'}
 """

 from typing import Any, Generator, NamedTuple
 from bs4 import BeautifulSoup, ResultSet
 from enum import Enum
 from collections import defaultdict
 import re
 from decimal import Decimal
 from datetime import datetime


 with open("Konekielinen-tiliote(24-03-12).html") as f:
    soup = BeautifulSoup(f, "html.parser")


 def to_decimal(s: str) -> Decimal:
    # If there's a - or + sign at the end, move it to the front
    if s[-1] in "+-":
        s = s[-1] + s[:-1]
    # Remove thousands separator and replace comma with dot
    s = s.replace(".", "").replace(",", ".")
    return Decimal(s)


 # Cells are separated by 📁 emoji
 CELL_SEPARATOR = BOX = "📁"
 # Text between cells is separated by ↩ emoji
 TAG_SEPARATOR = CARD_INDEX = "↩"


 class RowType(Enum):
    NordeaHeader = r"^↩Nordea↩\n↩1544 NBC FI Customers📁TILIOTE↩Päivämäärä↩(?P<raporttipvm>[\d\.]{10})📁Sivu\s+(?P<page>\d+)↩Kausi↩(?P<rangestart>[\d\.]{10}) -\n\s+(?P<rangeend>[\d\.]{10})↩(?P<moyear>\d{3} / \d{4})$"
    AccountHeader1 = r"^↩(?P<accountname>[A-Z\s-]+)📁(?P<accounttype>\w+)↩(?P<accountnumber>[\d-]+)📁Valuutta↩EUR$"
    AccountHeader2 = r"^📁IBAN↩(?P<iban>FI[\d ]{20})📁SWIFT/BIC↩NDEAFIHH↩$"
    TableHeader = r"^Arkistointitunnus↩Saajan tilinumero📁Maksup↩Arvop📁Saaja / Maksaja↩Viesti📁Tap.↩nro📁Määrä$"
    BalanceRow = r"^📁📁(?P<pvm>[\d\.]{10}) Saldo📁📁(?P<saldo>[\d\.,\+-]+)$"
    RegisteredDate = r"^Kirjauspäivä (?P<pvm>[\d\.]{6})📁📁📁📁$"

    # These are breakdown rows for Nordea banking fees. The main row has the archiving number and the total sum.
    # To avoid double counting, these should not be included in the final sums.
    # Note that the only difference is that the "arvop" group is missing. Additionally the value has a class in the <td> tag in the HTML but it's not visible in the text.
    NordeaPaymentBreakdown = r"J ↩(?P<arkistotunnus>[A-Z\d]+)↩📁(?P<maksup>[\d\.]+)📁/J↩(?P<counterparty>[^↩]+)↩(?P<viesti>[^↩]+?)\s*↩(?P<aika>[^↩]+)↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}(-|\+))"

    Withdrawal = r"(A |JE|J )?↩(?P<arkistotunnus>[A-Z\d]+)↩(\s(?P<viite>[\d-]+))?📁(?P<maksup>[\d\.]+)↩(?P<arvop>[\d\.]+)📁/(J|A)↩(?P<counterparty>[^↩]+)↩(?P<maksutyyppi>\d+ [\w-]+)\s+[^↩]+↩(?P<viesti>[^↩]+?)\s*↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}-)"
    Deposit = r"(A )?↩(?P<arkistotunnus>[A-Z\d]+)↩📁(?P<maksup>[\d\.]+)↩(?P<arvop>[\d\.]+)📁/J↩(?P<counterparty>[^↩]+)↩(?P<maksutyyppi>\d+ [\w-]+)\s*↩((?P<viesti>[^↩]+?)\s*|Viite (?P<viite>[\d ]+?)\s+↩(?P<viitetarkenne>\w+\n\w+)\s+)↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}\+)"
    SaldoRow = r"📁📁(?P<pvm>[\d\.]{10}) Saldo↩Käyttövara📁📁(?P<saldo>[\d\.,\+]+)↩(?P<kayttovara>[\d\.,\+]+)"
    PeriodWithdrawalDeposit = r"^📁📁(?P<panot>\d+)↩(?P<otot>\d+)↩Panot(?P<period_deposit>(/kausi|/kk|/vv| yhteensä))↩Otot(?P<period_withdrawal>(/kausi|/kk|/vv| yhteensä))📁📁(?P<panot_summa>[\d\.,\+]+)↩(?P<otot_summa>([\d\.,-]+|0,00\+))$"

    NewLine = r"^↩\xa0↩📁📁📁📁$"


 regexes = {rowtype: re.compile(rowtype.value) for rowtype in RowType}


 class Row(NamedTuple):
    rowtype: RowType
    text: str
    cells: ResultSet
    match: re.Match[str]


 def extractrows(soup) -> Generator[Row, Any, None]:
    """
    Yields rows from the table containing payment details as a list of cells.
    """
    t = soup.findAll("table")
    for table in t:
        rows = table.findAll("tr")
        for row in rows:
            cells = row.findAll(["td", "th"])
            if cells:
                astext = "📁".join(cell.get_text("↩").strip() for cell in cells)

                for rowtype in RowType:
                    if match := re.match(regexes[rowtype], astext):
                        yield Row(rowtype, astext, cells, match)
                        break
                else:
                    raise ValueError(f"Unknown row: {astext}")


 rows: dict[RowType, list[Row]] = defaultdict(list)
 for row in extractrows(soup):
    rows[row.rowtype].append(row)


 for rowtype, rowdata in rows.items():
    print("#################################")
    print(rowtype)
    print("#################################")
    for row in rowdata:
        print(row.match.groupdict())
    print()


 # def sanity_check():
 #     """
 #     Validates the data in the rows
 #     """
 #     # Validate the PeriodWithdrawalDeposit rows

 #     for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]:
 #         data = match.groupdict()
 #         assert (
 #             data["period_deposit"] == data["period_withdrawal"]
 #         ), "Period deposit and withdrawal should match"
 #         if data["period_deposit"] in {"/kk", "/vv"}:
 #             # The withdrawal and deposit counts are not counted for monthly and yearly periods
 #             # (Those are only counted for each "period" )
 #             assert data["panot"] == data["otot"] == "0"

 #     vv_total = next(
 #         (
 #             match.groupdict()
 #             for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
 #             if match.groupdict()["period_deposit"] == "/vv"
 #         ),
 #         None,
 #     )
 #     assert vv_total is not None, "No yearly total found"

 #     saldo_dates = {
 #         datetime.strptime(match.groupdict()["pvm"], "%d.%m.%Y"): to_decimal(
 #             match.groupdict()["saldo"]
 #         )
 #         for rowtype, text, cells, match in [
 #             *rows[RowType.BalanceRow],
 #             *rows[RowType.SaldoRow],
 #         ]
 #     }
 #     START_SALDO_DATE = min(saldo_dates.keys())
 #     START_SALDO = saldo_dates[START_SALDO_DATE]
 #     END_SALDO_DATE = max(saldo_dates.keys())
 #     END_SALDO = saldo_dates[END_SALDO_DATE]

 #     # Sum up the total deposits and withdrawals
 #     total_deposits = sum(
 #         to_decimal(match.groupdict()["summa"])
 #         for rowtype, text, cells, match in rows[RowType.Deposit]
 #     )
 #     total_withdrawals = sum(
 #         to_decimal(match.groupdict()["summa"])
 #         for rowtype, text, cells, match in rows[RowType.Withdrawal]
 #     )

 #     # Sum up "panot yhteensä" and "otot yhteensä" for the yearly period
 #     sum_panot = sum(
 #         to_decimal(match.groupdict()["panot_summa"])
 #         for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
 #         if match.groupdict()["period_deposit"] == " yhteensä"
 #     )
 #     sum_otot = sum(
 #         to_decimal(match.groupdict()["otot_summa"])
 #         for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
 #         if match.groupdict()["period_withdrawal"] == " yhteensä"
 #     )
 #     sum_panot_krt = sum(
 #         to_decimal(match.groupdict()["panot"])
 #         for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
 #         if match.groupdict()["period_deposit"] == " yhteensä"
 #     )
 #     sum_otot_krt = sum(
 #         to_decimal(match.groupdict()["otot"])
 #         for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
 #         if match.groupdict()["period_withdrawal"] == " yhteensä"
 #     )
 #     withdrawals_by_month = defaultdict(list)
 #     for row in rows[RowType.Withdrawal]:
 #         data = row.match.groupdict()
 #         withdrawals_by_month[datetime.strptime(data["maksup"], "%d.%m.").month].append(
 #             row
 #         )
 #     assert (
 #         sum(len(v) for v in withdrawals_by_month.values())
 #         == len(rows[RowType.Withdrawal])
 #         == sum_otot_krt
 #     )
 #     assert len(rows[RowType.Deposit]) == sum_panot_krt
 #     assert sum_panot == to_decimal(vv_total["panot_summa"]) == total_deposits
 #     assert sum_otot == to_decimal(vv_total["otot_summa"]) == total_withdrawals

 #     assert START_SALDO + total_deposits + total_withdrawals == END_SALDO

 # sanity_check()
	"""
	Parser for Finnish Nordea html format bank statements.

	File names are commonly in the name format:
	Konekielinen-tiliote(YY-MM-DD).html

	Yields categorized rows with deposited/withdrawn sums, reference numbers, messages, dates, etc. extracted with regexes.
	Access the rows with the rows dictionary using the RowType enum as the key, and get the properties of the row with the groupdict() method of the match object.

	Example:
	>>> rows[RowType.Deposit][0].match.groupdict()
	{'arkistotunnus': 'GSIPC000000000000',
	'maksup': '01.02.',
	'arvop': '01.02.',
	'counterparty': 'MEIKÄLÄINEN MATTI',
	'maksutyyppi': '710 Pano',
	'viesti': 'Maksun viesti\nNOTPROVIDED\nOPF00000000.0',
	'viite': None,
	'viitetarkenne': None,
	'tapahtumanro': '4',
	'summa': '69,42+'}
	"""

	from typing import Any, Generator, NamedTuple
	from bs4 import BeautifulSoup, ResultSet
	from enum import Enum
	from collections import defaultdict
	import re
	from decimal import Decimal
	from datetime import datetime


	with open("Konekielinen-tiliote(24-03-12).html") as f:
	soup = BeautifulSoup(f, "html.parser")


	def to_decimal(s: str) -> Decimal:
	# If there's a - or + sign at the end, move it to the front
	if s[-1] in "+-":
	s = s[-1] + s[:-1]
	# Remove thousands separator and replace comma with dot
	s = s.replace(".", "").replace(",", ".")
	return Decimal(s)


	# Cells are separated by 📁 emoji
	CELL_SEPARATOR = BOX = "📁"
	# Text between cells is separated by ↩ emoji
	TAG_SEPARATOR = CARD_INDEX = "↩"


	class RowType(Enum):
	NordeaHeader = r"^↩Nordea↩\n↩1544 NBC FI Customers📁TILIOTE↩Päivämäärä↩(?P<raporttipvm>[\d\.]{10})📁Sivu\s+(?P<page>\d+)↩Kausi↩(?P<rangestart>[\d\.]{10}) -\n\s+(?P<rangeend>[\d\.]{10})↩(?P<moyear>\d{3} / \d{4})$"
	AccountHeader1 = r"^↩(?P<accountname>[A-Z\s-]+)📁(?P<accounttype>\w+)↩(?P<accountnumber>[\d-]+)📁Valuutta↩EUR$"
	AccountHeader2 = r"^📁IBAN↩(?P<iban>FI[\d ]{20})📁SWIFT/BIC↩NDEAFIHH↩$"
	TableHeader = r"^Arkistointitunnus↩Saajan tilinumero📁Maksup↩Arvop📁Saaja / Maksaja↩Viesti📁Tap.↩nro📁Määrä$"
	BalanceRow = r"^📁📁(?P<pvm>[\d\.]{10}) Saldo📁📁(?P<saldo>[\d\.,\+-]+)$"
	RegisteredDate = r"^Kirjauspäivä (?P<pvm>[\d\.]{6})📁📁📁📁$"

	# These are breakdown rows for Nordea banking fees. The main row has the archiving number and the total sum.
	# To avoid double counting, these should not be included in the final sums.
	# Note that the only difference is that the "arvop" group is missing. Additionally the value has a class in the <td> tag in the HTML but it's not visible in the text.
	NordeaPaymentBreakdown = r"J ↩(?P<arkistotunnus>[A-Z\d]+)↩📁(?P<maksup>[\d\.]+)📁/J↩(?P<counterparty>[^↩]+)↩(?P<viesti>[^↩]+?)\s*↩(?P<aika>[^↩]+)↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}(-\|\+))"

	Withdrawal = r"(A \|JE\|J )?↩(?P<arkistotunnus>[A-Z\d]+)↩(\s(?P<viite>[\d-]+))?📁(?P<maksup>[\d\.]+)↩(?P<arvop>[\d\.]+)📁/(J\|A)↩(?P<counterparty>[^↩]+)↩(?P<maksutyyppi>\d+ [\w-]+)\s+[^↩]+↩(?P<viesti>[^↩]+?)\s*↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}-)"
	Deposit = r"(A )?↩(?P<arkistotunnus>[A-Z\d]+)↩📁(?P<maksup>[\d\.]+)↩(?P<arvop>[\d\.]+)📁/J↩(?P<counterparty>[^↩]+)↩(?P<maksutyyppi>\d+ [\w-]+)\s↩((?P<viesti>[^↩]+?)\s\|Viite (?P<viite>[\d ]+?)\s+↩(?P<viitetarkenne>\w+\n\w+)\s+)↩📁(?P<tapahtumanro>\d+)📁(?P<summa>[\d\.]+,\d{2}\+)"
	SaldoRow = r"📁📁(?P<pvm>[\d\.]{10}) Saldo↩Käyttövara📁📁(?P<saldo>[\d\.,\+]+)↩(?P<kayttovara>[\d\.,\+]+)"
	PeriodWithdrawalDeposit = r"^📁📁(?P<panot>\d+)↩(?P<otot>\d+)↩Panot(?P<period_deposit>(/kausi\|/kk\|/vv\| yhteensä))↩Otot(?P<period_withdrawal>(/kausi\|/kk\|/vv\| yhteensä))📁📁(?P<panot_summa>[\d\.,\+]+)↩(?P<otot_summa>([\d\.,-]+\|0,00\+))$"

	NewLine = r"^↩\xa0↩📁📁📁📁$"


	regexes = {rowtype: re.compile(rowtype.value) for rowtype in RowType}


	class Row(NamedTuple):
	rowtype: RowType
	text: str
	cells: ResultSet
	match: re.Match[str]


	def extractrows(soup) -> Generator[Row, Any, None]:
	"""
	Yields rows from the table containing payment details as a list of cells.
	"""
	t = soup.findAll("table")
	for table in t:
	rows = table.findAll("tr")
	for row in rows:
	cells = row.findAll(["td", "th"])
	if cells:
	astext = "📁".join(cell.get_text("↩").strip() for cell in cells)

	for rowtype in RowType:
	if match := re.match(regexes[rowtype], astext):
	yield Row(rowtype, astext, cells, match)
	break
	else:
	raise ValueError(f"Unknown row: {astext}")


	rows: dict[RowType, list[Row]] = defaultdict(list)
	for row in extractrows(soup):
	rows[row.rowtype].append(row)


	for rowtype, rowdata in rows.items():
	print("#################################")
	print(rowtype)
	print("#################################")
	for row in rowdata:
	print(row.match.groupdict())
	print()


	# def sanity_check():
	# """
	# Validates the data in the rows
	# """
	# # Validate the PeriodWithdrawalDeposit rows

	# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]:
	# data = match.groupdict()
	# assert (
	# data["period_deposit"] == data["period_withdrawal"]
	# ), "Period deposit and withdrawal should match"
	# if data["period_deposit"] in {"/kk", "/vv"}:
	# # The withdrawal and deposit counts are not counted for monthly and yearly periods
	# # (Those are only counted for each "period" )
	# assert data["panot"] == data["otot"] == "0"

	# vv_total = next(
	# (
	# match.groupdict()
	# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
	# if match.groupdict()["period_deposit"] == "/vv"
	# ),
	# None,
	# )
	# assert vv_total is not None, "No yearly total found"

	# saldo_dates = {
	# datetime.strptime(match.groupdict()["pvm"], "%d.%m.%Y"): to_decimal(
	# match.groupdict()["saldo"]
	# )
	# for rowtype, text, cells, match in [
	# *rows[RowType.BalanceRow],
	# *rows[RowType.SaldoRow],
	# ]
	# }
	# START_SALDO_DATE = min(saldo_dates.keys())
	# START_SALDO = saldo_dates[START_SALDO_DATE]
	# END_SALDO_DATE = max(saldo_dates.keys())
	# END_SALDO = saldo_dates[END_SALDO_DATE]

	# # Sum up the total deposits and withdrawals
	# total_deposits = sum(
	# to_decimal(match.groupdict()["summa"])
	# for rowtype, text, cells, match in rows[RowType.Deposit]
	# )
	# total_withdrawals = sum(
	# to_decimal(match.groupdict()["summa"])
	# for rowtype, text, cells, match in rows[RowType.Withdrawal]
	# )

	# # Sum up "panot yhteensä" and "otot yhteensä" for the yearly period
	# sum_panot = sum(
	# to_decimal(match.groupdict()["panot_summa"])
	# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
	# if match.groupdict()["period_deposit"] == " yhteensä"
	# )
	# sum_otot = sum(
	# to_decimal(match.groupdict()["otot_summa"])
	# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
	# if match.groupdict()["period_withdrawal"] == " yhteensä"
	# )
	# sum_panot_krt = sum(
	# to_decimal(match.groupdict()["panot"])
	# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
	# if match.groupdict()["period_deposit"] == " yhteensä"
	# )
	# sum_otot_krt = sum(
	# to_decimal(match.groupdict()["otot"])
	# for rowtype, text, cells, match in rows[RowType.PeriodWithdrawalDeposit]
	# if match.groupdict()["period_withdrawal"] == " yhteensä"
	# )
	# withdrawals_by_month = defaultdict(list)
	# for row in rows[RowType.Withdrawal]:
	# data = row.match.groupdict()
	# withdrawals_by_month[datetime.strptime(data["maksup"], "%d.%m.").month].append(
	# row
	# )
	# assert (
	# sum(len(v) for v in withdrawals_by_month.values())
	# == len(rows[RowType.Withdrawal])
	# == sum_otot_krt
	# )
	# assert len(rows[RowType.Deposit]) == sum_panot_krt
	# assert sum_panot == to_decimal(vv_total["panot_summa"]) == total_deposits
	# assert sum_otot == to_decimal(vv_total["otot_summa"]) == total_withdrawals

	# assert START_SALDO + total_deposits + total_withdrawals == END_SALDO

	# sanity_check()
No results found