Skip to content

Instantly share code, notes, and snippets.

@aleien
Created August 17, 2025 10:48
Show Gist options
  • Select an option

  • Save aleien/782b677040087a39da54f9407d1a90c0 to your computer and use it in GitHub Desktop.

Select an option

Save aleien/782b677040087a39da54f9407d1a90c0 to your computer and use it in GitHub Desktop.
Extract operations from T-bank account statement (2025), python3
import csv
import re
import subprocess
import sys
from os import listdir
from os.path import isfile, join

# pdfplumber is the only third-party dependency, so it is the only import
# that can legitimately fail; install it on first run and retry the import.
try:
    import pdfplumber
except ImportError:
    print("Устанавливаем необходимые библиотеки...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pdfplumber"])
    import pdfplumber
# Directory that is scanned for statement PDFs, and the CSV file to produce.
pdf_dir = "pdfs"
csv_path = "output.csv"

# Patterns for the pieces of an operation line in the extracted statement text.
date_pattern = re.compile(r"\d{2}\.\d{2}\.\d{4}")
amount_pattern = re.compile(r"[-+][\d\s]+\.\d{2}\s₽")
time_pattern = re.compile(r"\d{2}:\d{2}")

# A line starting with one of these prefixes ends the operations on a page.
STOP_PREFIXES = (
    "Пополнения:",
    "АО «ТБанк» универсальная лицензия Банка России",
)

CSV_HEADER = [
    "Дата операции",
    "Сумма операции",
    "Описание",
    "Номер карты",
]


def _strip_all(pattern, text):
    """Remove every match of *pattern*, repeating while one is left at the start."""
    while True:
        text = pattern.sub("", text)
        if not pattern.match(text):
            return text


def _build_row(date, amount, remainder, continuation):
    """Assemble one CSV row from the parts of a single operation.

    The last whitespace-separated token of the first line's remainder is
    taken as the card number; everything before it, followed by the
    continuation lines, forms the description.
    """
    tokens = remainder.split()
    card_number = tokens.pop() if tokens else ""
    description = " ".join(tokens + continuation)
    return [date, amount, description, card_number]


def parse_page_lines(lines):
    """Parse the text lines of one page into operation rows.

    A line that starts with a date (after times and amounts are removed)
    opens a new operation; the following lines up to the next such line
    are appended to its description. Lines seen before the first operation
    on the page are ignored. Processing stops at the first line that starts
    with one of ``STOP_PREFIXES``.

    Args:
        lines: Iterable of text lines from a single page.

    Returns:
        A list of ``[date, amount, description, card_number]`` lists of
        strings. ``amount`` is the first amount on the line reduced to
        sign, digits and the decimal point, or ``""`` when the line has
        no amount.
    """
    rows = []
    # (date, amount, remainder of the first line, continuation lines)
    pending = None
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith(STOP_PREFIXES):
            break
        line = time_pattern.sub("", line)

        # Take the first amount on the line, then drop all amounts from it.
        amount = ""
        amount_match = amount_pattern.search(line)
        if amount_match:
            amount = re.sub(r"[^+\-\d.]", "", amount_match.group())
            line = _strip_all(amount_pattern, line)

        date_match = date_pattern.match(line)
        if date_match:
            # A leading date marks the beginning of a new operation.
            if pending is not None:
                rows.append(_build_row(*pending))
            remainder = _strip_all(date_pattern, line)
            pending = (date_match.group(), amount, remainder, [])
        elif pending is not None:
            extra = line.strip()
            # Skip lines that became empty once times/amounts were removed.
            if extra:
                pending[3].append(extra)
    if pending is not None:
        rows.append(_build_row(*pending))
    return rows


def extract_rows(pdf_path):
    """Return the operation rows found in every page of one PDF file.

    Operations are collected page by page, so an operation never absorbs
    lines from the following page.
    """
    rows = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against pages for which no text could be extracted.
            text = page.extract_text() or ""
            rows.extend(parse_page_lines(text.split("\n")))
    return rows


def main():
    """Convert every PDF in ``pdf_dir`` into a single CSV at ``csv_path``."""
    # Sorted for a reproducible row order; non-PDF files (e.g. .DS_Store)
    # are skipped instead of being handed to pdfplumber.
    pdfs = [
        join(pdf_dir, name)
        for name in sorted(listdir(pdf_dir))
        if name.lower().endswith(".pdf") and isfile(join(pdf_dir, name))
    ]

    rows = []
    for pdf_path in pdfs:
        pdf_rows = extract_rows(pdf_path)
        # Progress output: number of operations parsed from this file.
        print(pdf_path + f" {len(pdf_rows)}")
        rows.extend(pdf_rows)

    # Save to CSV.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(CSV_HEADER)
        writer.writerows(rows)
    print(f"✅ CSV сохранён в: {csv_path}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment