Skip to content

Instantly share code, notes, and snippets.

@yashodhank
Last active September 29, 2025 19:20
Show Gist options
  • Select an option

  • Save yashodhank/4a5d891168d0b6175d47df31d388f74d to your computer and use it in GitHub Desktop.

Select an option

Save yashodhank/4a5d891168d0b6175d47df31d388f74d to your computer and use it in GitHub Desktop.
A streaming, zero-deps Python 3 tool to filter MySQL dumps by table prefix, with .sql / .sql.gz support, and optional WordPress cleanup: strip revisions, auto-drafts, trashed posts, and oEmbed caches. Includes list/report modes, progress meter, verbose logging, and glob-based exclusions via --drop-like (e.g., *_aioseo_cache).
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse, gzip, io, os, re, sys, fnmatch
from collections import defaultdict
# ---- I/O ----
def open_input(path: str):
    """Open *path* for text reading, transparently decompressing ``.gz`` files.

    UTF-8 is decoded with replacement for undecodable bytes; ``newline=""``
    preserves the dump's original line endings.
    """
    expanded = os.path.expanduser(path)
    opener = gzip.open if expanded.endswith(".gz") else open
    return opener(expanded, "rt", encoding="utf-8", errors="replace", newline="")
def open_output(path: str):
    """Open *path* for text writing, gzip-compressing when it ends in ``.gz``.

    Missing parent directories are created; ``newline=""`` writes lines
    exactly as produced by the filter.
    """
    target = os.path.expanduser(path)
    os.makedirs(os.path.dirname(target) or ".", exist_ok=True)
    if not target.endswith(".gz"):
        return open(target, "wt", encoding="utf-8", newline="")
    # Wrap the binary gzip stream so callers can write str transparently.
    return io.TextIOWrapper(gzip.open(target, "wb"), encoding="utf-8", newline="")
# ---- SQL tokenization helpers ----
# Matches any statement that names a table; group 2 is the raw table token
# (may still carry backticks/quotes/schema qualifier -- see clean_table()).
TABLE_TOK = re.compile(
    r"""(?ix)
    \b(INSERT\s+INTO|CREATE\s+TABLE|DROP\s+TABLE|LOCK\s+TABLES|ALTER\s+TABLE|
    TRUNCATE\s+TABLE|REPLACE\s+INTO|DELETE\s+FROM)\s+([^\s(;]+)
    """
)
# mysqldump section headers ("-- Table structure for table `x`"), which appear
# before the statements themselves; used so a block is classified at its header.
COMMENT_TABLE_BACKTICK = re.compile(r"""(?i)^--\s+(?:Table structure for table|Dumping data for table)\s+`([^`]+)`""")
COMMENT_TABLE_QUOTE = re.compile(r"""(?i)^--\s+(?:Table structure for table|Dumping data for table)\s+"([^"]+)"""")
# Conditional-comment form mysqldump emits around bulk inserts:
# /*!40000 ALTER TABLE `x` DISABLE KEYS */
BANG_ALTER_KEYS = re.compile(r"""(?i)/\*![0-9]+\s+ALTER\s+TABLE\s+([^\s]+)\s+(?:DISABLE|ENABLE)\s+KEYS""")
# (regex, label) pairs used by type_of() to classify a line's statement kind.
TYPE_MATCHERS = [
    (re.compile(r'(?i)^\s*INSERT\s+INTO\s'), "INSERT"),
    (re.compile(r'(?i)^\s*CREATE\s+TABLE\s'), "CREATE"),
    (re.compile(r'(?i)^\s*ALTER\s+TABLE\s'), "ALTER"),
    (re.compile(r'(?i)^\s*DROP\s+TABLE\s'), "DROP"),
    (re.compile(r'(?i)^\s*TRUNCATE\s+TABLE\s'), "TRUNCATE"),
    (re.compile(r'(?i)^\s*REPLACE\s+INTO\s'), "REPLACE"),
    (re.compile(r'(?i)^\s*DELETE\s+FROM\s'), "DELETE"),
    (re.compile(r'(?i)^\s*LOCK\s+TABLES'), "LOCK"),
]
# SQL string-literal delimiters recognized by the tokenizers below.
QUOTE_CHARS = {'"', "'"}
def clean_table(token: str) -> str:
    """Normalize a raw table token: strip quoting and any schema qualifier."""
    name = token.strip().strip('`"')
    if '.' in name:
        # Drop a leading `db`. qualifier, keeping only the table part.
        _, name = name.split('.', 1)
    return name.strip('`"')
def grab_table(line: str) -> str:
    """Return the cleaned table name referenced by *line*, or '' if none."""
    hit = TABLE_TOK.search(line)
    if hit:
        return clean_table(hit.group(2))
    # mysqldump section headers (backtick- or double-quote-delimited).
    for rx in (COMMENT_TABLE_BACKTICK, COMMENT_TABLE_QUOTE):
        hit = rx.match(line)
        if hit:
            return clean_table(hit.group(1))
    hit = BANG_ALTER_KEYS.search(line)
    return clean_table(hit.group(1)) if hit else ""
def prefix_of(tbl: str) -> str:
    """Return the leading '<word>_' prefix of a table name, or 'NO_PREFIX'."""
    hit = re.match(r'^[A-Za-z0-9$]+_', tbl)
    if hit is None:
        return "NO_PREFIX"
    return hit.group(0)
def type_of(line: str) -> str:
    """Classify a SQL line by its leading statement keyword ('OTHER' if none)."""
    return next((name for rx, name in TYPE_MATCHERS if rx.search(line)), "OTHER")
def statement_ends(line: str) -> bool:
    """True when *line* terminates a statement (trailing ';' ignoring whitespace)."""
    return line.rstrip()[-1:] == ';'
# ---- INSERT parsing ----
# Decomposes one complete INSERT statement into named parts:
#   table  -- raw table token (optionally backtick-quoted)
#   cols   -- the '(col, ...)' list including parens, or None
#   values -- everything between VALUES and the optional tail / final ';'
#   tail   -- trailing ' ON DUPLICATE KEY UPDATE ...' clause, or None
# (?s) lets '.' span newlines so multi-line statements match once joined.
INSERT_HEAD_RE = re.compile(
    r'(?is)^\s*INSERT\s+INTO\s+(`?)(?P<table>[^\s(]+)\1\s*(?P<cols>\([^)]+\))?\s*VALUES\s*(?P<values>.+?)(?P<tail>\s+ON\s+DUPLICATE\s+KEY\s+UPDATE\s+.*)?;\s*$'
)
def split_values_tuples(values_blob: str):
    """Split the VALUES payload of an INSERT into its top-level '(...)' tuples.

    String literals (with backslash escapes) are respected, so commas and
    parentheses inside them do not terminate a tuple.  If the payload does
    not start with '(', the whole blob is returned as a single element.
    """
    tuples = []
    pos, end = 0, len(values_blob)
    while pos < end:
        # Skip whitespace and the commas separating tuples.
        while pos < end and values_blob[pos].isspace():
            pos += 1
        if pos < end and values_blob[pos] == ',':
            pos += 1
            continue
        if pos >= end:
            break
        if values_blob[pos] != '(':
            # Not a tuple list we understand -- hand back the raw blob.
            return [values_blob.strip()]
        depth = 0
        start = pos
        in_str, escaped, quote_ch = False, False, ''
        while pos < end:
            ch = values_blob[pos]
            if in_str:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == quote_ch:
                    in_str = False
            elif ch in ('"', "'"):
                in_str, quote_ch = True, ch
            elif ch == '(':
                depth += 1
            elif ch == ')':
                depth -= 1
                if depth == 0:
                    pos += 1
                    tuples.append(values_blob[start:pos].strip())
                    break
            pos += 1
    return tuples
def parse_columns_list(cols: str):
    """Parse the '(col1, col2, ...)' clause of an INSERT into lowercase names.

    Returns None when no column list is present.  Commas inside quoted
    identifiers are honoured; backticks/quotes are stripped from each name.
    """
    if not cols:
        return None
    inner = cols.strip()[1:-1]
    names = []
    token = []
    in_str, escaped, quote_ch = False, False, ''

    def flush():
        # Finish the current identifier: trim, unquote, lowercase.
        names.append(''.join(token).strip().strip('`"').lower())
        token.clear()

    for ch in inner:
        if in_str:
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == quote_ch:
                in_str = False
            token.append(ch)
        elif ch in ('"', "'"):
            in_str, quote_ch = True, ch
            token.append(ch)
        elif ch == ',':
            flush()
        else:
            token.append(ch)
    if token:
        flush()
    return [name for name in names if name]
def values_tuple_to_fields(tuple_text: str):
    """Split one '(v1, v2, ...)' tuple into its top-level field literals.

    Commas inside string literals (with backslash escapes) or nested
    parentheses do not split fields; each returned element keeps its
    original quoting.

    Raises:
        ValueError: if *tuple_text* is not wrapped in parentheses.  Callers
            catch ValueError to pass malformed statements through unchanged;
            the original ``assert`` raised AssertionError instead, which
            crashed the run (and disappears entirely under ``python -O``).
    """
    if not (tuple_text.startswith('(') and tuple_text.endswith(')')):
        raise ValueError(f"not a parenthesized tuple: {tuple_text[:40]!r}")
    inner = tuple_text[1:-1]
    fields, cur = [], []
    in_str = False
    esc = False
    quote = ''
    depth = 0
    for ch in inner:
        if in_str:
            cur.append(ch)
            if esc:
                esc = False
            elif ch == '\\':
                esc = True
            elif ch == quote:
                in_str = False
        elif ch in ('"', "'"):
            in_str, quote = True, ch
            cur.append(ch)
        elif ch == '(':
            depth += 1
            cur.append(ch)
        elif ch == ')':
            depth -= 1
            cur.append(ch)
        elif ch == ',' and depth == 0:
            # Field boundary only at paren depth 0 and outside strings.
            fields.append(''.join(cur).strip())
            cur = []
        else:
            cur.append(ch)
    if cur:
        fields.append(''.join(cur).strip())
    return fields
def sql_lit_equals(sql_literal: str, target: str) -> bool:
    """True iff *sql_literal* is a quoted SQL string whose content equals *target*."""
    text = sql_literal.strip()
    quoted = len(text) >= 2 and text[0] == text[-1] and text[0] in ('"', "'")
    return quoted and text[1:-1] == target
def sql_lit_startswith(sql_literal: str, prefix: str) -> bool:
    """True iff *sql_literal* is a quoted SQL string whose content starts with *prefix*."""
    text = sql_literal.strip()
    quoted = len(text) >= 2 and text[0] == text[-1] and text[0] in ('"', "'")
    return quoted and text[1:-1].startswith(prefix)
def strip_quotes(s: str) -> str:
    """Remove one layer of matching single/double quotes, if present."""
    text = s.strip()
    if len(text) >= 2 and text[0] == text[-1] and text[0] in ('"', "'"):
        text = text[1:-1]
    return text
def reconstruct_insert(table: str, cols: str, tuple_chunks, tail: str) -> str:
    """Rebuild a single-line INSERT statement from its parsed parts."""
    cols_part = f" {cols}" if cols else ""
    values_part = ",".join(tuple_chunks)
    return f"INSERT INTO `{table}`{cols_part} VALUES {values_part}{tail or ''};\n"
# ---- Modes ----
def mode_report(infile, keep_prefix: str, path_display: str, drop_patterns, verbose=False, progress_mb=0):
    """Dry-run: scan the dump and print per-prefix statistics. Writes nothing.

    infile        -- iterable of dump lines (already opened, text mode)
    keep_prefix   -- prefix that WOULD be kept in filter mode
    path_display  -- input path, shown in the report header
    drop_patterns -- --drop-like globs; matching tables are ignored entirely
    progress_mb   -- if >0, print a progress line every N MB to stderr
    """
    tabs_by_pfx = defaultdict(int)     # prefix -> count of unique tables
    total_tabs = 0                     # unique tables overall
    stmt_counts = defaultdict(int)     # (prefix, stmt type) -> referencing-line count
    samples = defaultdict(list)        # prefix -> up to 15 sample table names
    seen_tables = set()                # (prefix, table) pairs already counted
    base_seen = defaultdict(lambda: defaultdict(bool))  # base name -> prefix -> seen
    base_pfx_count = defaultdict(int)  # base name -> number of distinct prefixes
    bytes_read = 0
    next_mark = progress_mb * 1024 * 1024 if progress_mb else 0
    def register_table(t: str, line: str):
        # Count the table once per (prefix, name); tally the statement type
        # of EVERY line that references it.
        nonlocal total_tabs
        pfx = prefix_of(t)
        key = (pfx, t)
        if key not in seen_tables:
            seen_tables.add(key)
            tabs_by_pfx[pfx] += 1
            total_tabs += 1
            if len(samples[pfx]) < 15: samples[pfx].append(t)
            # Track the prefix-less "base" name to detect the same logical
            # table appearing under several prefixes (multi-site dumps).
            m = re.match(r'^[^_]+_(.*)$', t)
            if m:
                base = m.group(1)
                if not base_seen[base][pfx]:
                    base_seen[base][pfx] = True
                    base_pfx_count[base] += 1
        st = type_of(line)
        stmt_counts[(pfx, st)] += 1
    for line in infile:
        # NOTE: len(line) counts characters, not bytes -- progress is approximate.
        bytes_read += len(line)
        if next_mark and bytes_read >= next_mark:
            print(f"[report] {bytes_read//(1024*1024)} MB...", file=sys.stderr)
            next_mark += progress_mb * 1024 * 1024
        t = grab_table(line)
        if not t: continue
        # apply drop-like
        if any(fnmatch.fnmatch(t, pat) for pat in drop_patterns): continue
        register_table(t, line)
    prefixes = sorted(tabs_by_pfx.keys())
    kept_tabs = sum(tabs_by_pfx[p] for p in prefixes if p == keep_prefix)
    drop_tabs = sum(tabs_by_pfx[p] for p in prefixes if p != keep_prefix)
    total_kd = kept_tabs + drop_tabs
    kpct = (100.0 * kept_tabs / total_kd) if total_kd else 0.0
    dpct = (100.0 * drop_tabs / total_kd) if total_kd else 0.0
    print("=== SQL DUMP REPORT (DRY RUN) ===")
    print(f"Input file: {path_display}")
    print(f"Chosen KEEP prefix: {keep_prefix}")
    if drop_patterns:
        print(f"Extra drop-like patterns: {', '.join(drop_patterns)}")
    print(f"Distinct prefixes found: {len(prefixes)}")
    print(f"Distinct tables referenced: {total_tabs}\n")
    print(f"KEEP tables: {kept_tabs} ({kpct:.2f}%)")
    print(f"DROP tables: {drop_tabs} ({dpct:.2f}%)\n")
    typelist = ["CREATE","INSERT","ALTER","DROP","TRUNCATE","REPLACE","DELETE","LOCK","OTHER"]
    # Per-prefix section: KEEP/DROP verdict, statement-type counts, samples.
    for p in prefixes:
        print(f"-- [{'KEEP' if p==keep_prefix else 'DROP'}] {p} --")
        print(f" Unique tables: {tabs_by_pfx[p]}")
        for ty in typelist:
            c = stmt_counts[(p, ty)]
            if c: print(f" {ty:<8}: {c}")
        if samples[p]:
            print(" Sample tables (up to 15):")
            for s in samples[p]: print(f" - {s}")
            if tabs_by_pfx[p] > len(samples[p]):
                print(f" ... (+{tabs_by_pfx[p]-len(samples[p])} more)")
        print()
    # Base table names seen under more than one prefix (show at most 15).
    collisions = [b for b, n in base_pfx_count.items() if n > 1]
    if collisions:
        print(f"Cross-prefix collisions (same base table name across prefixes): {len(collisions)}")
        shown = 0
        for b in collisions:
            pf = [p for p in prefixes if base_seen[b][p]]
            print(f" - {b} (prefixes: {', '.join(pf)})")
            shown += 1
            if shown >= 15:
                print(" ... (+more)")
                break
        print()
    print("=== END REPORT ===")
def mode_list(infile, keep_prefix: str, drop_patterns, progress_mb=0):
    """Dry-run: print every unique table grouped by prefix. Writes nothing."""
    tables = defaultdict(list)   # prefix -> table names in first-seen order
    recorded = set()             # (prefix, table) pairs already listed
    consumed = 0
    checkpoint = progress_mb * 1024 * 1024 if progress_mb else 0
    for line in infile:
        consumed += len(line)
        if checkpoint and consumed >= checkpoint:
            print(f"[list] {consumed//(1024*1024)} MB...", file=sys.stderr)
            checkpoint += progress_mb * 1024 * 1024
        name = grab_table(line)
        if not name:
            continue
        # --drop-like exclusions are skipped entirely.
        if any(fnmatch.fnmatch(name, pat) for pat in drop_patterns):
            continue
        pfx = prefix_of(name)
        if (pfx, name) not in recorded:
            recorded.add((pfx, name))
            tables[pfx].append(name)
    print("== Tables found ==")
    for pfx in sorted(tables):
        if pfx == keep_prefix:
            print(f"-- Will KEEP ({keep_prefix}) --")
        else:
            print(f"-- Will DROP (prefix {pfx}) --")
        for name in tables[pfx]:
            print(name)
        print()
# ---- Filter with WP junk stripping ----
def filter_and_write(infile, outfile, keep_prefix: str,
                     strip_revisions: bool, strip_autodrafts: bool,
                     strip_trashed: bool, strip_oembed: bool,
                     wp_prefix: str, drop_patterns, verbose=False,
                     progress_mb=0):
    """Stream *infile* to *outfile*, keeping only tables under *keep_prefix*.

    Optionally prunes WordPress junk from INSERT statements:
      strip_revisions  -- drop <wp_prefix>posts rows with post_type='revision'
      strip_autodrafts -- drop rows with post_status='auto-draft'
      strip_trashed    -- drop rows with post_status='trash'
      strip_oembed     -- drop oEmbed caches in postmeta and options
    Dropped post IDs cascade: their <wp_prefix>postmeta rows are dropped too
    (only works while removed_post_ids is populated, i.e. posts must appear
    before their postmeta in the dump -- standard mysqldump order).
    drop_patterns -- --drop-like globs excluded even when the prefix matches.
    """
    removed_post_ids = set()  # IDs of dropped posts, for the postmeta cascade
    posts_table = f"{wp_prefix}posts"
    postmeta_table = f"{wp_prefix}postmeta"
    options_table = f"{wp_prefix}options"
    def table_is_dropped_by_pattern(tbl: str) -> bool:
        # True when tbl matches any --drop-like glob.
        return any(fnmatch.fnmatch(tbl, pat) for pat in drop_patterns)
    def should_keep_table(tbl: str) -> bool:
        # Keep only prefix-matching tables that no glob excludes.
        return tbl.startswith(keep_prefix) and not table_is_dropped_by_pattern(tbl)
    def process_insert_line(line: str) -> str:
        # Rewrite one complete INSERT statement, removing unwanted tuples.
        # Returns "" when every tuple was dropped; otherwise the (possibly
        # reconstructed) statement. Non-WP tables pass through untouched.
        m = INSERT_HEAD_RE.match(line)
        if not m: return line
        table = clean_table(m.group('table'))
        cols = m.group('cols')
        vals = m.group('values')
        tail = m.group('tail') or ""
        if table not in (posts_table, postmeta_table, options_table):
            return line
        tuples = split_values_tuples(vals)
        if not tuples: return line
        col_list = parse_columns_list(cols)
        kept = []
        if table == posts_table:
            try:
                if col_list:
                    # Exact column positions from the INSERT's column list.
                    id_idx = col_list.index("id")
                    pt_idx = col_list.index("post_type")
                    ps_idx = col_list.index("post_status")
                else:
                    # No column list: assume ID first; scan all fields for
                    # the status/type literals (heuristic fallback).
                    id_idx = 0
                    pt_idx = None
                    ps_idx = None
                dropped_here = 0
                for tup in tuples:
                    f = values_tuple_to_fields(tup)
                    if not f: continue
                    is_rev = is_ad = is_tr = False
                    if strip_revisions:
                        if pt_idx is not None and pt_idx < len(f):
                            is_rev = sql_lit_equals(f[pt_idx], "revision")
                        else:
                            is_rev = any(sql_lit_equals(x, "revision") for x in f)
                    if strip_autodrafts or strip_trashed:
                        if ps_idx is not None and ps_idx < len(f):
                            if strip_autodrafts: is_ad = sql_lit_equals(f[ps_idx], "auto-draft")
                            if strip_trashed: is_tr = sql_lit_equals(f[ps_idx], "trash")
                        else:
                            if strip_autodrafts: is_ad = any(sql_lit_equals(x, "auto-draft") for x in f)
                            if strip_trashed: is_tr = any(sql_lit_equals(x, "trash") for x in f)
                    if is_rev or is_ad or is_tr:
                        # Remember the ID so matching postmeta rows cascade.
                        if id_idx < len(f):
                            removed_post_ids.add(strip_quotes(f[id_idx]))
                        dropped_here += 1
                        continue
                    kept.append(tup)
                if verbose and dropped_here:
                    print(f"[posts] dropped {dropped_here} rows (rev/auto-draft/trash)", file=sys.stderr)
            except ValueError:
                # Column missing or tuple malformed: pass through untouched.
                return line
        elif table == postmeta_table:
            try:
                if col_list:
                    pid_idx = col_list.index("post_id")
                    mk_idx = col_list.index("meta_key") if strip_oembed else None
                else:
                    # Standard postmeta layout: (meta_id, post_id, meta_key, meta_value).
                    pid_idx = 1
                    mk_idx = 2 if strip_oembed else None
                dropped_here = 0
                for tup in tuples:
                    f = values_tuple_to_fields(tup)
                    if not f: continue
                    post_id = strip_quotes(f[pid_idx]) if pid_idx < len(f) else None
                    if post_id and post_id in removed_post_ids:
                        dropped_here += 1
                        continue
                    if strip_oembed and mk_idx is not None and mk_idx < len(f):
                        mk = f[mk_idx]
                        if sql_lit_startswith(mk, "_oembed_") or sql_lit_startswith(mk, "_oembed_time_"):
                            dropped_here += 1
                            continue
                    kept.append(tup)
                if verbose and dropped_here:
                    print(f"[postmeta] dropped {dropped_here} rows (cascade/oEmbed)", file=sys.stderr)
            except ValueError:
                return line
        elif table == options_table and strip_oembed:
            try:
                if col_list:
                    on_idx = col_list.index("option_name")
                else:
                    # Standard options layout: (option_id, option_name, ...).
                    on_idx = 1
                dropped_here = 0
                for tup in tuples:
                    f = values_tuple_to_fields(tup)
                    if not f: continue
                    opt_name = f[on_idx] if on_idx < len(f) else ""
                    if (sql_lit_startswith(opt_name, "_transient_oembed_") or
                            sql_lit_startswith(opt_name, "_transient_timeout_oembed_")):
                        dropped_here += 1
                        continue
                    kept.append(tup)
                if verbose and dropped_here:
                    print(f"[options] dropped {dropped_here} rows (oEmbed transients)", file=sys.stderr)
            except ValueError:
                return line
        else:
            # BUG FIX: previously the options table with --strip-oembed OFF
            # fell through every branch with kept == [], and the
            # `if not kept: return ""` below silently deleted every
            # options INSERT statement from the output.
            return line
        if not kept:
            return ""
        return reconstruct_insert(table, cols if cols else "", kept, tail)
    bytes_read = 0
    next_mark = progress_mb * 1024 * 1024 if progress_mb else 0
    buffer = []          # accumulates the lines of a multi-line INSERT
    in_block = False     # inside a statement/section tied to current_table
    keep_block = False   # verdict for the current block
    current_table = ""
    for raw in infile:
        line = raw
        bytes_read += len(line)
        if next_mark and bytes_read >= next_mark:
            print(f"[filter] {bytes_read//(1024*1024)} MB...", file=sys.stderr)
            next_mark += progress_mb * 1024 * 1024
        t = grab_table(line)
        semicolon = statement_ends(line)
        if t and not in_block:
            # First line naming a table decides the block's KEEP/DROP fate.
            in_block = True
            current_table = t
            keep_block = should_keep_table(t)
            if verbose and t.startswith(keep_prefix) and not keep_block:
                print(f"[drop-like] excluding table: {t}", file=sys.stderr)
        if keep_block:
            if line.lstrip().lower().startswith("insert into"):
                buffer.append(line)
                if semicolon:
                    full = ''.join(buffer); buffer.clear()
                    mod = process_insert_line(full)
                    if mod: outfile.write(mod)
            elif buffer:
                # Continuation of a multi-line INSERT.
                buffer.append(line)
                if semicolon:
                    full = ''.join(buffer); buffer.clear()
                    # BUG FIX: multi-line INSERTs were previously flushed
                    # with outfile.write(full), bypassing WP cleanup even
                    # though INSERT_HEAD_RE is DOTALL and handles them.
                    mod = process_insert_line(full)
                    if mod: outfile.write(mod)
            else:
                outfile.write(line)
        else:
            # Dropped table: still track multi-line INSERTs so we know
            # where the statement ends, but discard the content.
            if buffer:
                buffer.append(line)
                if semicolon: buffer.clear()
            elif line.lstrip().lower().startswith("insert into"):
                buffer.append(line)
                if semicolon: buffer.clear()
        if in_block and semicolon:
            in_block = False
            keep_block = False
            current_table = ""
# ---- CLI ----
def main():
    """CLI entry point: parse arguments and dispatch to report/list/filter mode."""
    p = argparse.ArgumentParser(
        description="Filter MySQL dump by table prefix; list/report; strip WP junk; progress; drop-like patterns.",
        formatter_class=argparse.RawTextHelpFormatter
    )
    p.add_argument("-i", dest="infile", default=os.path.expanduser("~/dump.sql"),
                   help="Input dump (.sql or .sql.gz). Default: ~/dump.sql")
    p.add_argument("-o", dest="outfile", default=os.path.expanduser("~/dump.filtered.sql"),
                   help="Output file for filtered mode (.sql or .sql.gz). Default: ~/dump.filtered.sql")
    p.add_argument("-p", dest="prefix", default="7CHIc_", help="Table prefix to KEEP. Default: 7CHIc_")
    p.add_argument("-v", "--verbose", action="store_true", help="Verbose logs to stderr.")
    p.add_argument("--progress-every-mb", type=int, default=0, help="Print progress every N MB read.")
    # --list / --report share one dest so they are mutually exclusive in effect;
    # when neither is given, args.mode is None and we fall back to "filter".
    p.add_argument("--list", dest="mode", action="store_const", const="list", help="List tables grouped by prefix.")
    p.add_argument("--report", dest="mode", action="store_const", const="report", help="Detailed dry-run report.")
    # WP cleanup toggles
    p.add_argument("--strip-revisions", action="store_true", help="Remove wp_posts revisions + related postmeta.")
    p.add_argument("--strip-auto-drafts", action="store_true", help="Remove wp_posts auto-drafts + related postmeta.")
    p.add_argument("--strip-trashed", action="store_true", help="Remove wp_posts with post_status='trash' + postmeta.")
    p.add_argument("--strip-oembed", action="store_true", help="Remove oEmbed caches (postmeta + options transients).")
    p.add_argument("--strip-wp-junk", action="store_true",
                   help="Enable all: --strip-revisions --strip-auto-drafts --strip-trashed --strip-oembed")
    p.add_argument("--wp-prefix", dest="wp_prefix", default=None,
                   help="WordPress table prefix for cleanup (defaults to -p).")
    # Extra: drop-like table patterns (repeatable)
    p.add_argument("--drop-like", action="append", default=[], metavar="PATTERN",
                   help="Glob pattern to exclude tables (e.g. '*_aioseo_cache'). Repeatable.")
    args = p.parse_args()
    mode = args.mode or "filter"
    infile_path = os.path.expanduser(args.infile)
    outfile_path = os.path.expanduser(args.outfile)
    keep_prefix = args.prefix
    wp_prefix = args.wp_prefix or keep_prefix
    if not os.path.isfile(infile_path):
        print(f"Input not found: {infile_path}", file=sys.stderr)
        sys.exit(1)
    # Expand shorthand: --strip-wp-junk turns on every individual toggle.
    strip_revisions = args.strip_revisions or args.strip_wp_junk
    strip_autodrafts = args.strip_auto_drafts or args.strip_wp_junk
    strip_trashed = args.strip_trashed or args.strip_wp_junk
    strip_oembed = args.strip_oembed or args.strip_wp_junk
    drop_patterns = args.drop_like or []
    with open_input(infile_path) as fin:
        if mode == "report":
            mode_report(fin, keep_prefix, infile_path, drop_patterns, verbose=args.verbose, progress_mb=args.progress_every_mb)
        elif mode == "list":
            mode_list(fin, keep_prefix, drop_patterns, progress_mb=args.progress_every_mb)
        else:
            with open_output(outfile_path) as fout:
                filter_and_write(fin, fout, keep_prefix,
                                 strip_revisions, strip_autodrafts,
                                 strip_trashed, strip_oembed,
                                 wp_prefix, drop_patterns,
                                 verbose=args.verbose, progress_mb=args.progress_every_mb)
            print(f"✅ Done. Wrote filtered dump to: {outfile_path}")
if __name__ == "__main__":
    main()

clean_sql_by_prefix.py

A streaming MySQL dump cleaner in Python 3 that:

  • Keeps only SQL statements for a chosen table prefix (e.g., wp_ or 7CHIc_).
  • Provides list and report dry-run modes to understand dump structure.
  • Optionally strips WordPress junk before import:
    • Revisions (post_type='revision') from <prefix>posts and their matching <prefix>postmeta rows.
    • Auto-drafts (post_status='auto-draft') and trashed posts (post_status='trash') + related postmeta.
    • oEmbed caches in <prefix>postmeta (meta_key LIKE '_oembed_%') and <prefix>options (_transient_oembed_*).
  • Works with .sql and .sql.gz; uses low memory (streams line-by-line).
  • Includes --progress-every-mb, -v/--verbose, and --drop-like glob patterns to exclude certain tables.

Perfect for multi-site dumps where you only want a single site’s tables and a cleaner WordPress dataset.


Quick Start

# Make executable
chmod +x clean_sql_by_prefix.py

# (A) Report structure and stats
./clean_sql_by_prefix.py -i ~/dump.sql.gz --report -p 7CHIc_ --progress-every-mb 64

# (B) List tables per prefix (grouped)
./clean_sql_by_prefix.py -i ~/dump.sql --list -p 7CHIc_

# (C) Filter to prefix only
./clean_sql_by_prefix.py -i ~/dump.sql -o ~/dump.filtered.sql -p 7CHIc_

# (D) Filter + prune WordPress junk (revisions, auto-drafts, trashed, oembed)
./clean_sql_by_prefix.py -i ~/dump.sql -o ~/dump.filtered.sql -p 7CHIc_ --strip-wp-junk -v --progress-every-mb 32

# (E) Exclude noisy cache tables even if they match the prefix
./clean_sql_by_prefix.py -i ~/dump.sql -o ~/dump.filtered.sql -p 7CHIc_ \
  --strip-wp-junk --drop-like '*_aioseo_cache' --drop-like '*_revslider_static_slides'

Your earlier command (adapted/corrected):

./clean_sql_by_prefix.py \
  -i ~/sevenmentor/wp7mdb_22m2022_2025-08-06_05-48-28.sql \
  -o ~/sevenmentor/dump.filtered-stripped.sql \
  -p 7CHIc_ \
  --strip-revisions --strip-wp-junk \
  -v --progress-every-mb 32 \
  --drop-like '*_aioseo_cache'

Installation

No dependencies beyond Python 3.8+. Works on Linux/macOS; Windows via WSL or Python.

# Clone or download the script from the Gist, then:
chmod +x clean_sql_by_prefix.py
python3 --version  # ensure 3.8+

.gz support for input and output is built-in.


Usage

usage: clean_sql_by_prefix.py [-h] [-i INFILE] [-o OUTFILE] [-p PREFIX] [-v]
                              [--progress-every-mb N] [--list | --report]
                              [--strip-revisions] [--strip-auto-drafts]
                              [--strip-trashed] [--strip-oembed] [--strip-wp-junk]
                              [--wp-prefix WP_PREFIX]
                              [--drop-like PATTERN]...

Modes

  • Default (filter): write only statements for the chosen prefix to -o.
  • --list: show unique tables grouped by prefix, with KEEP/DROP headers.
  • --report: detailed dry run with per-prefix statement-type counts, sample tables, and cross-prefix collisions.

WordPress Cleanup Options

  • --strip-revisions — drop rows in <prefix>posts where post_type='revision' and cascade their IDs to <prefix>postmeta (drop matching post_id).
  • --strip-auto-drafts — drop rows with post_status='auto-draft' from <prefix>posts + their postmeta.
  • --strip-trashed — drop rows with post_status='trash' from <prefix>posts + their postmeta.
  • --strip-oembed — drop oEmbed caches in <prefix>postmeta (_oembed_%, _oembed_time_%) and <prefix>options (_transient_oembed_%, _transient_timeout_oembed_%).
  • --strip-wp-junk — convenience switch enabling all of the above.
  • --wp-prefix — override the WordPress logical prefix for cleanup if it differs from the filter prefix (rare).

Extra Controls

  • --drop-like PATTERN — exclude specific tables by glob pattern (repeatable), e.g. '*_aioseo_cache', 'wp_*_logs'.
  • --progress-every-mb N — print progress to stderr every N MB processed.
  • -v/--verbose — extra logs for what’s being dropped.

How It Works

  • Parses the dump streamingly and groups multi-line INSERT statements until ;.
  • Detects the table involved in INSERT/CREATE/ALTER/DROP/TRUNCATE/REPLACE/DELETE/LOCK and decides if it should be written based on -p and --drop-like.
  • For WordPress cleanup, it parses INSERT INTO <table> (...) VALUES (...),(...); and selectively removes tuples without loading the whole file in memory.

Safety & Tips

  • Always keep the original dump; write results to a new file (-o).
  • If INSERT uses column lists, detection is exact. If not, the script uses positions + literal scanning for edge cases.
  • After filtering, you can re-compress: gzip -9 dump.filtered.sql.
  • If you have multiple sites in a single dump, run --report first to confirm prefixes.

Performance

  • Line-by-line streaming; no full file load.
  • Multi-GB dumps are supported.
  • Use --progress-every-mb 64 on very large files to see progress.

Known Limitations

  • Exotic SQL formatting (e.g., custom delimiters, stored procedures with internal semicolons) isn’t the target. Standard mysqldump output is supported.
  • If you have non-standard column orders without explicit column lists, WordPress cleanup detection falls back to heuristics.

Troubleshooting

  • SyntaxError during copy/paste: grab raw from the Gist; ensure your shell didn’t mangle quotes.
  • No output written: ensure -p matches your tables; try --list to preview. Also check --drop-like patterns.
  • Revisions not removed: verify your WordPress tables use the same logical prefix as -p or pass --wp-prefix.
  • Windows: use WSL or run with python clean_sql_by_prefix.py.

FAQ

Q: Can I pipe input? A: Not currently. Provide -i pointing to a file. (You can zcat dump.sql.gz > dump.sql first if needed.)

Q: Can I keep multiple prefixes at once? A: Use multiple passes or edit the script to accept a list of keep prefixes.

Q: Will it also drop orphaned postmeta for non-revision deletions? A: Yes for auto-drafts and trashed posts when those options are enabled; it cascades by post_id.


Changelog

  • 1.1.0 — Added --drop-like, --progress-every-mb, -v/--verbose; extended WP cleanup and robustness.
  • 1.0.0 — Initial Python port of Bash/AWK tool; added --strip-revisions and gzip support.

License

This project is licensed under the MIT License. See LICENSE.

MIT License
Copyright (c) 2025
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment