#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse, gzip, io, os, re, sys, fnmatch
from collections import defaultdict


# ---- I/O ----
def open_input(path: str):
    """Open a plain or gzipped dump for reading as UTF-8 text."""
    path = os.path.expanduser(path)
    if path.endswith(".gz"):
        return gzip.open(path, "rt", encoding="utf-8", errors="replace", newline="")
    return open(path, "rt", encoding="utf-8", errors="replace", newline="")


def open_output(path: str):
    """Open a plain or gzipped output file for writing, creating parent dirs."""
    path = os.path.expanduser(path)
    d = os.path.dirname(path) or "."
    os.makedirs(d, exist_ok=True)
    if path.endswith(".gz"):
        raw = gzip.open(path, "wb")
        return io.TextIOWrapper(raw, encoding="utf-8", newline="")
    return open(path, "wt", encoding="utf-8", newline="")


# ---- SQL tokenization helpers ----
TABLE_TOK = re.compile(
    r"""(?ix)
    \b(INSERT\s+INTO|CREATE\s+TABLE|DROP\s+TABLE|LOCK\s+TABLES|ALTER\s+TABLE|
    TRUNCATE\s+TABLE|REPLACE\s+INTO|DELETE\s+FROM)\s+([^\s(;]+)
    """
)
COMMENT_TABLE_BACKTICK = re.compile(r"""(?i)^--\s+(?:Table structure for table|Dumping data for table)\s+`([^`]+)`""")
COMMENT_TABLE_QUOTE = re.compile(r"""(?i)^--\s+(?:Table structure for table|Dumping data for table)\s+"([^"]+)"""")
BANG_ALTER_KEYS = re.compile(r"""(?i)/\*![0-9]+\s+ALTER\s+TABLE\s+([^\s]+)\s+(?:DISABLE|ENABLE)\s+KEYS""")
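
# Lines these patterns are meant to catch, sketched with a hypothetical
# `wp_posts` table (grab_table() below returns the cleaned name "wp_posts"):
#   TABLE_TOK:              INSERT INTO `wp_posts` VALUES (...)
#   COMMENT_TABLE_BACKTICK: -- Table structure for table `wp_posts`
#   BANG_ALTER_KEYS:        /*!40000 ALTER TABLE `wp_posts` DISABLE KEYS */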

TYPE_MATCHERS = [
    (re.compile(r'(?i)^\s*INSERT\s+INTO\s'), "INSERT"),
    (re.compile(r'(?i)^\s*CREATE\s+TABLE\s'), "CREATE"),
    (re.compile(r'(?i)^\s*ALTER\s+TABLE\s'), "ALTER"),
    (re.compile(r'(?i)^\s*DROP\s+TABLE\s'), "DROP"),
    (re.compile(r'(?i)^\s*TRUNCATE\s+TABLE\s'), "TRUNCATE"),
    (re.compile(r'(?i)^\s*REPLACE\s+INTO\s'), "REPLACE"),
    (re.compile(r'(?i)^\s*DELETE\s+FROM\s'), "DELETE"),
    (re.compile(r'(?i)^\s*LOCK\s+TABLES'), "LOCK"),
]

QUOTE_CHARS = {'"', "'"}


def clean_table(token: str) -> str:
    t = token.strip().strip('`"')
    if '.' in t:
        t = t.split('.', 1)[1]
    return t.strip('`"')
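
# Examples (illustrative identifiers):
#   clean_table("`wp_posts`")        -> "wp_posts"
#   clean_table("`mydb`.`wp_posts`") -> "wp_posts"  (schema qualifier dropped)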


def grab_table(line: str) -> str:
    m = TABLE_TOK.search(line)
    if m: return clean_table(m.group(2))
    m = COMMENT_TABLE_BACKTICK.match(line)
    if m: return clean_table(m.group(1))
    m = COMMENT_TABLE_QUOTE.match(line)
    if m: return clean_table(m.group(1))
    m = BANG_ALTER_KEYS.search(line)
    if m: return clean_table(m.group(1))
    return ""


def prefix_of(tbl: str) -> str:
    m = re.match(r'^[A-Za-z0-9$]+_', tbl)
    return m.group(0) if m else "NO_PREFIX"
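
# Examples:
#   prefix_of("wp_posts") -> "wp_"
#   prefix_of("posts")    -> "NO_PREFIX"
# Note: multisite-style names only yield the first segment, since '_' is not
# in the character class: prefix_of("wp_2_posts") -> "wp_", not "wp_2_".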


def type_of(line: str) -> str:
    for rx, name in TYPE_MATCHERS:
        if rx.search(line): return name
    return "OTHER"


def statement_ends(line: str) -> bool:
    # Heuristic: a ';' at the end of a quoted value would also match, but
    # mysqldump terminates each statement's final line with ';'.
    return line.rstrip().endswith(';')


# ---- INSERT parsing ----
INSERT_HEAD_RE = re.compile(
    r'(?is)^\s*INSERT\s+INTO\s+(`?)(?P<table>[^\s(]+)\1\s*(?P<cols>\([^)]+\))?\s*VALUES\s*(?P<values>.+?)(?P<tail>\s+ON\s+DUPLICATE\s+KEY\s+UPDATE\s+.*)?;\s*$'
)
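
# What INSERT_HEAD_RE captures, sketched on a hypothetical statement:
#   INSERT INTO `wp_posts` (`ID`, `post_type`) VALUES (1,'post'),(2,'revision');
#   table  -> wp_posts (backticks stripped later by clean_table)
#   cols   -> (`ID`, `post_type`)
#   values -> (1,'post'),(2,'revision')
#   tail   -> optional "ON DUPLICATE KEY UPDATE ..." suffix, if present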


def split_values_tuples(values_blob: str):
    out, i, n = [], 0, len(values_blob)
    while i < n:
        while i < n and values_blob[i].isspace(): i += 1
        if i < n and values_blob[i] == ',': i += 1; continue
        if i >= n: break
        if values_blob[i] != '(': return [values_blob.strip()]
        depth = 0; start = i; in_str = False; esc = False; quote = ''
        while i < n:
            ch = values_blob[i]
            if in_str:
                if esc: esc = False
                elif ch == '\\': esc = True
                elif ch == quote: in_str = False
            else:
                if ch in QUOTE_CHARS: in_str = True; quote = ch
                elif ch == '(': depth += 1
                elif ch == ')':
                    depth -= 1
                    if depth == 0:
                        i += 1
                        out.append(values_blob[start:i].strip())
                        break
            i += 1
    return out
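
# Example: tuples split on top-level commas only; quoted commas are respected:
#   split_values_tuples("(1,'a,b'),(2,'c')") -> ["(1,'a,b')", "(2,'c')"]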


def parse_columns_list(cols: str):
    if not cols: return None
    inner = cols.strip()[1:-1]
    raw, cur, in_str, esc, quote = [], [], False, False, ''
    for ch in inner:
        if in_str:
            if esc: esc = False
            elif ch == '\\': esc = True
            elif ch == quote: in_str = False
            cur.append(ch)
        else:
            if ch in QUOTE_CHARS: in_str = True; quote = ch; cur.append(ch)
            elif ch == '`': cur.append(ch)
            elif ch == ',': raw.append(''.join(cur).strip().strip('`"').lower()); cur = []
            else: cur.append(ch)
    if cur: raw.append(''.join(cur).strip().strip('`"').lower())
    return [c for c in (t.strip() for t in raw) if c]
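
# Example: column names come back unquoted and lower-cased:
#   parse_columns_list("(`ID`, `post_type`)") -> ["id", "post_type"]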


def values_tuple_to_fields(tuple_text: str):
    assert tuple_text.startswith('(') and tuple_text.endswith(')')
    inner = tuple_text[1:-1]
    fields, cur = [], []
    in_str = False; esc = False; quote = ''; depth = 0
    for ch in inner:
        if in_str:
            cur.append(ch)
            if esc: esc = False
            elif ch == '\\': esc = True
            elif ch == quote: in_str = False
        else:
            if ch in QUOTE_CHARS: in_str = True; quote = ch; cur.append(ch)
            elif ch == '(': depth += 1; cur.append(ch)
            elif ch == ')': depth -= 1; cur.append(ch)
            elif ch == ',' and depth == 0:
                fields.append(''.join(cur).strip()); cur = []
            else: cur.append(ch)
    if cur: fields.append(''.join(cur).strip())
    return fields
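
# Example: fields keep their SQL quoting; embedded commas don't split:
#   values_tuple_to_fields("(1,'a,b',NULL)") -> ["1", "'a,b'", "NULL"]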


def sql_lit_equals(sql_literal: str, target: str) -> bool:
    sl = sql_literal.strip()
    if len(sl) >= 2 and sl[0] == sl[-1] and sl[0] in QUOTE_CHARS:
        return sl[1:-1] == target
    return False


def sql_lit_startswith(sql_literal: str, prefix: str) -> bool:
    sl = sql_literal.strip()
    if len(sl) >= 2 and sl[0] == sl[-1] and sl[0] in QUOTE_CHARS:
        return sl[1:-1].startswith(prefix)
    return False


def strip_quotes(s: str) -> str:
    s = s.strip()
    if len(s) >= 2 and s[0] == s[-1] and s[0] in QUOTE_CHARS:
        return s[1:-1]
    return s
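
# Examples (illustrative literals):
#   sql_lit_equals("'revision'", "revision")        -> True
#   sql_lit_startswith("'_oembed_abc'", "_oembed_") -> True
#   strip_quotes("'123'")                           -> "123"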


def reconstruct_insert(table: str, cols: str, tuple_chunks, tail: str) -> str:
    head = f"INSERT INTO `{table}`"
    if cols: head += f" {cols}"
    head += " VALUES "
    body = ",".join(tuple_chunks)
    return f"{head}{body}{tail or ''};\n"


# ---- Modes ----
def mode_report(infile, keep_prefix: str, path_display: str, drop_patterns, verbose=False, progress_mb=0):
    tabs_by_pfx = defaultdict(int)
    total_tabs = 0
    stmt_counts = defaultdict(int)
    samples = defaultdict(list)
    seen_tables = set()
    base_seen = defaultdict(lambda: defaultdict(bool))
    base_pfx_count = defaultdict(int)
    bytes_read = 0
    next_mark = progress_mb * 1024 * 1024 if progress_mb else 0

    def register_table(t: str, line: str):
        nonlocal total_tabs
        pfx = prefix_of(t)
        key = (pfx, t)
        if key not in seen_tables:
            seen_tables.add(key)
            tabs_by_pfx[pfx] += 1
            total_tabs += 1
            if len(samples[pfx]) < 15: samples[pfx].append(t)
            m = re.match(r'^[^_]+_(.*)$', t)
            if m:
                base = m.group(1)
                if not base_seen[base][pfx]:
                    base_seen[base][pfx] = True
                    base_pfx_count[base] += 1
        st = type_of(line)
        stmt_counts[(pfx, st)] += 1

    for line in infile:
        bytes_read += len(line)
        if next_mark and bytes_read >= next_mark:
            print(f"[report] {bytes_read//(1024*1024)} MB...", file=sys.stderr)
            next_mark += progress_mb * 1024 * 1024
        t = grab_table(line)
        if not t: continue
        # apply drop-like patterns
        if any(fnmatch.fnmatch(t, pat) for pat in drop_patterns): continue
        register_table(t, line)

    prefixes = sorted(tabs_by_pfx.keys())
    kept_tabs = sum(tabs_by_pfx[p] for p in prefixes if p == keep_prefix)
    drop_tabs = sum(tabs_by_pfx[p] for p in prefixes if p != keep_prefix)
    total_kd = kept_tabs + drop_tabs
    kpct = (100.0 * kept_tabs / total_kd) if total_kd else 0.0
    dpct = (100.0 * drop_tabs / total_kd) if total_kd else 0.0

    print("=== SQL DUMP REPORT (DRY RUN) ===")
    print(f"Input file: {path_display}")
    print(f"Chosen KEEP prefix: {keep_prefix}")
    if drop_patterns:
        print(f"Extra drop-like patterns: {', '.join(drop_patterns)}")
    print(f"Distinct prefixes found: {len(prefixes)}")
    print(f"Distinct tables referenced: {total_tabs}\n")
    print(f"KEEP tables: {kept_tabs} ({kpct:.2f}%)")
    print(f"DROP tables: {drop_tabs} ({dpct:.2f}%)\n")

    typelist = ["CREATE","INSERT","ALTER","DROP","TRUNCATE","REPLACE","DELETE","LOCK","OTHER"]
    for p in prefixes:
        print(f"-- [{'KEEP' if p==keep_prefix else 'DROP'}] {p} --")
        print(f"   Unique tables: {tabs_by_pfx[p]}")
        for ty in typelist:
            c = stmt_counts[(p, ty)]
            if c: print(f"   {ty:<8}: {c}")
        if samples[p]:
            print("   Sample tables (up to 15):")
            for s in samples[p]: print(f"     - {s}")
            if tabs_by_pfx[p] > len(samples[p]):
                print(f"     ... (+{tabs_by_pfx[p]-len(samples[p])} more)")
        print()

    collisions = [b for b, n in base_pfx_count.items() if n > 1]
    if collisions:
        print(f"Cross-prefix collisions (same base table name across prefixes): {len(collisions)}")
        shown = 0
        for b in collisions:
            pf = [p for p in prefixes if base_seen[b][p]]
            print(f"  - {b} (prefixes: {', '.join(pf)})")
            shown += 1
            if shown >= 15:
                print("  ... (+more)")
                break
        print()
    print("=== END REPORT ===")


def mode_list(infile, keep_prefix: str, drop_patterns, progress_mb=0):
    tabs_by_pfx = defaultdict(list)
    seen = set()
    bytes_read = 0
    next_mark = progress_mb * 1024 * 1024 if progress_mb else 0
    for line in infile:
        bytes_read += len(line)
        if next_mark and bytes_read >= next_mark:
            print(f"[list] {bytes_read//(1024*1024)} MB...", file=sys.stderr)
            next_mark += progress_mb * 1024 * 1024
        t = grab_table(line)
        if not t: continue
        if any(fnmatch.fnmatch(t, pat) for pat in drop_patterns): continue
        pfx = prefix_of(t)
        key = (pfx, t)
        if key not in seen:
            seen.add(key)
            tabs_by_pfx[pfx].append(t)

    print("== Tables found ==")
    for p in sorted(tabs_by_pfx.keys()):
        head = f"-- Will KEEP ({keep_prefix}) --" if p == keep_prefix else f"-- Will DROP (prefix {p}) --"
        print(head)
        for t in tabs_by_pfx[p]: print(t)
        print()


# ---- Filter with WP junk stripping ----
def filter_and_write(infile, outfile, keep_prefix: str,
                     strip_revisions: bool, strip_autodrafts: bool,
                     strip_trashed: bool, strip_oembed: bool,
                     wp_prefix: str, drop_patterns, verbose=False,
                     progress_mb=0):
    removed_post_ids = set()
    posts_table = f"{wp_prefix}posts"
    postmeta_table = f"{wp_prefix}postmeta"
    options_table = f"{wp_prefix}options"

    def table_is_dropped_by_pattern(tbl: str) -> bool:
        return any(fnmatch.fnmatch(tbl, pat) for pat in drop_patterns)

    def should_keep_table(tbl: str) -> bool:
        return tbl.startswith(keep_prefix) and not table_is_dropped_by_pattern(tbl)

    def process_insert_line(line: str) -> str:
        m = INSERT_HEAD_RE.match(line)
        if not m: return line
        table = clean_table(m.group('table'))
        cols = m.group('cols')
        vals = m.group('values')
        tail = m.group('tail') or ""

        if table not in (posts_table, postmeta_table, options_table):
            return line

        tuples = split_values_tuples(vals)
        if not tuples: return line
        col_list = parse_columns_list(cols)
        kept = []

        if table == posts_table:
            try:
                if col_list:
                    id_idx = col_list.index("id")
                    pt_idx = col_list.index("post_type")
                    ps_idx = col_list.index("post_status")
                else:
                    # No column list: fall back to ID in position 0 and
                    # scanning every field for the marker literals.
                    id_idx = 0
                    pt_idx = None
                    ps_idx = None
                dropped_here = 0
                for tup in tuples:
                    f = values_tuple_to_fields(tup)
                    if not f: continue
                    is_rev = is_ad = is_tr = False
                    if strip_revisions:
                        if pt_idx is not None and pt_idx < len(f):
                            is_rev = sql_lit_equals(f[pt_idx], "revision")
                        else:
                            is_rev = any(sql_lit_equals(x, "revision") for x in f)
                    if strip_autodrafts or strip_trashed:
                        if ps_idx is not None and ps_idx < len(f):
                            if strip_autodrafts: is_ad = sql_lit_equals(f[ps_idx], "auto-draft")
                            if strip_trashed: is_tr = sql_lit_equals(f[ps_idx], "trash")
                        else:
                            if strip_autodrafts: is_ad = any(sql_lit_equals(x, "auto-draft") for x in f)
                            if strip_trashed: is_tr = any(sql_lit_equals(x, "trash") for x in f)
                    drop = is_rev or is_ad or is_tr
                    if drop:
                        if id_idx < len(f):
                            removed_post_ids.add(strip_quotes(f[id_idx]))
                        dropped_here += 1
                        continue
                    kept.append(tup)
                if verbose and dropped_here:
                    print(f"[posts] dropped {dropped_here} rows (rev/auto-draft/trash)", file=sys.stderr)
            except ValueError:
                return line

        elif table == postmeta_table:
            try:
                if col_list:
                    pid_idx = col_list.index("post_id")
                    mk_idx = col_list.index("meta_key") if strip_oembed else None
                else:
                    pid_idx = 1
                    mk_idx = 2 if strip_oembed else None
                dropped_here = 0
                for tup in tuples:
                    f = values_tuple_to_fields(tup)
                    if not f: continue
                    post_id = strip_quotes(f[pid_idx]) if pid_idx < len(f) else None
                    if post_id and post_id in removed_post_ids:
                        dropped_here += 1
                        continue
                    if strip_oembed and mk_idx is not None and mk_idx < len(f):
                        mk = f[mk_idx]
                        if sql_lit_startswith(mk, "_oembed_") or sql_lit_startswith(mk, "_oembed_time_"):
                            dropped_here += 1
                            continue
                    kept.append(tup)
                if verbose and dropped_here:
                    print(f"[postmeta] dropped {dropped_here} rows (cascade/oEmbed)", file=sys.stderr)
            except ValueError:
                return line

        elif table == options_table and strip_oembed:
            try:
                if col_list:
                    on_idx = col_list.index("option_name")
                else:
                    on_idx = 1
                dropped_here = 0
                for tup in tuples:
                    f = values_tuple_to_fields(tup)
                    if not f: continue
                    opt_name = f[on_idx] if on_idx < len(f) else ""
                    if (sql_lit_startswith(opt_name, "_transient_oembed_") or
                            sql_lit_startswith(opt_name, "_transient_timeout_oembed_")):
                        dropped_here += 1
                        continue
                    kept.append(tup)
                if verbose and dropped_here:
                    print(f"[options] dropped {dropped_here} rows (oEmbed transients)", file=sys.stderr)
            except ValueError:
                return line

        else:
            # options table with --strip-oembed off: nothing to filter here.
            # Without this branch, `kept` would stay empty and the whole
            # statement would be deleted below.
            return line

        if not kept:
            return ""
        return reconstruct_insert(table, cols if cols else "", kept, tail)

    bytes_read = 0
    next_mark = progress_mb * 1024 * 1024 if progress_mb else 0
    buffer = []
    in_block = False
    keep_block = False
    current_table = ""

    for raw in infile:
        line = raw
        bytes_read += len(line)  # character count; close enough for progress
        if next_mark and bytes_read >= next_mark:
            print(f"[filter] {bytes_read//(1024*1024)} MB...", file=sys.stderr)
            next_mark += progress_mb * 1024 * 1024

        t = grab_table(line)
        semicolon = statement_ends(line)
        if t and not in_block:
            in_block = True
            current_table = t
            keep_block = should_keep_table(t)
            if verbose and t.startswith(keep_prefix) and not keep_block:
                print(f"[drop-like] excluding table: {t}", file=sys.stderr)

        if keep_block:
            if line.lstrip().lower().startswith("insert into") or buffer:
                # Start of an INSERT, or the continuation of a multi-line one:
                # buffer until the terminating ';' so the whole statement can
                # be row-filtered at once.
                buffer.append(line)
                if semicolon:
                    full = ''.join(buffer); buffer.clear()
                    mod = process_insert_line(full)
                    if mod: outfile.write(mod)
            else:
                outfile.write(line)
        else:
            # Dropped table: still track multi-line INSERTs so we know where
            # the statement ends, but write nothing.
            if buffer:
                buffer.append(line)
                if semicolon: buffer.clear()
            elif line.lstrip().lower().startswith("insert into"):
                buffer.append(line)
                if semicolon: buffer.clear()

        if in_block and semicolon:
            in_block = False
            keep_block = False
            current_table = ""

    # Flush a trailing kept INSERT that never saw its terminating ';'
    # (e.g. a truncated dump); it passes through unchanged if unparseable.
    if buffer and keep_block:
        mod = process_insert_line(''.join(buffer))
        if mod: outfile.write(mod)


# ---- CLI ----
def main():
    p = argparse.ArgumentParser(
        description="Filter MySQL dump by table prefix; list/report; strip WP junk; progress; drop-like patterns.",
        formatter_class=argparse.RawTextHelpFormatter
    )
    p.add_argument("-i", dest="infile", default=os.path.expanduser("~/dump.sql"),
                   help="Input dump (.sql or .sql.gz). Default: ~/dump.sql")
    p.add_argument("-o", dest="outfile", default=os.path.expanduser("~/dump.filtered.sql"),
                   help="Output file for filtered mode (.sql or .sql.gz). Default: ~/dump.filtered.sql")
    p.add_argument("-p", dest="prefix", default="7CHIc_", help="Table prefix to KEEP. Default: 7CHIc_")
    p.add_argument("-v", "--verbose", action="store_true", help="Verbose logs to stderr.")
    p.add_argument("--progress-every-mb", type=int, default=0, help="Print progress every N MB read.")

    p.add_argument("--list", dest="mode", action="store_const", const="list", help="List tables grouped by prefix.")
    p.add_argument("--report", dest="mode", action="store_const", const="report", help="Detailed dry-run report.")

    # WP cleanup toggles
    p.add_argument("--strip-revisions", action="store_true", help="Remove wp_posts revisions + related postmeta.")
    p.add_argument("--strip-auto-drafts", action="store_true", help="Remove wp_posts auto-drafts + related postmeta.")
    p.add_argument("--strip-trashed", action="store_true", help="Remove wp_posts with post_status='trash' + postmeta.")
    p.add_argument("--strip-oembed", action="store_true", help="Remove oEmbed caches (postmeta + options transients).")
    p.add_argument("--strip-wp-junk", action="store_true",
                   help="Enable all: --strip-revisions --strip-auto-drafts --strip-trashed --strip-oembed")
    p.add_argument("--wp-prefix", dest="wp_prefix", default=None,
                   help="WordPress table prefix for cleanup (defaults to -p).")

    # Extra: drop-like table patterns (repeatable)
    p.add_argument("--drop-like", action="append", default=[], metavar="PATTERN",
                   help="Glob pattern to exclude tables (e.g. '*_aioseo_cache'). Repeatable.")

    args = p.parse_args()
    mode = args.mode or "filter"

    infile_path = os.path.expanduser(args.infile)
    outfile_path = os.path.expanduser(args.outfile)
    keep_prefix = args.prefix
    wp_prefix = args.wp_prefix or keep_prefix

    if not os.path.isfile(infile_path):
        print(f"Input not found: {infile_path}", file=sys.stderr)
        sys.exit(1)

    # Expand shorthand
    strip_revisions = args.strip_revisions or args.strip_wp_junk
    strip_autodrafts = args.strip_auto_drafts or args.strip_wp_junk
    strip_trashed = args.strip_trashed or args.strip_wp_junk
    strip_oembed = args.strip_oembed or args.strip_wp_junk

    drop_patterns = args.drop_like or []

    with open_input(infile_path) as fin:
        if mode == "report":
            mode_report(fin, keep_prefix, infile_path, drop_patterns, verbose=args.verbose, progress_mb=args.progress_every_mb)
        elif mode == "list":
            mode_list(fin, keep_prefix, drop_patterns, progress_mb=args.progress_every_mb)
        else:
            with open_output(outfile_path) as fout:
                filter_and_write(fin, fout, keep_prefix,
                                 strip_revisions, strip_autodrafts,
                                 strip_trashed, strip_oembed,
                                 wp_prefix, drop_patterns,
                                 verbose=args.verbose, progress_mb=args.progress_every_mb)
            print(f"✅ Done. Wrote filtered dump to: {outfile_path}")


if __name__ == "__main__":
    main()
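
# Illustrative invocations (hypothetical filename filter_dump.py):
#   ./filter_dump.py -i ~/dump.sql.gz -p wp_ --report
#   ./filter_dump.py -i ~/dump.sql -o ~/dump.filtered.sql.gz -p wp_ \
#       --strip-wp-junk --drop-like '*_aioseo_cache' --progress-every-mb 256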