#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse, gzip, io, os, re, sys, fnmatch
from collections import defaultdict


# ---- I/O ----
def open_input(path: str):
    """Open a plain or gzipped dump for reading as UTF-8 text."""
    path = os.path.expanduser(path)
    if path.endswith(".gz"):
        return gzip.open(path, "rt", encoding="utf-8", errors="replace", newline="")
    return open(path, "rt", encoding="utf-8", errors="replace", newline="")


def open_output(path: str):
    """Open a plain or gzipped output file for writing, creating parent dirs."""
    path = os.path.expanduser(path)
    d = os.path.dirname(path) or "."
    os.makedirs(d, exist_ok=True)
    if path.endswith(".gz"):
        raw = gzip.open(path, "wb")
        return io.TextIOWrapper(raw, encoding="utf-8", newline="")
    return open(path, "wt", encoding="utf-8", newline="")


# ---- SQL tokenization helpers ----
TABLE_TOK = re.compile(
    r"""(?ix)
    \b(INSERT\s+INTO|CREATE\s+TABLE|DROP\s+TABLE|LOCK\s+TABLES|ALTER\s+TABLE|
    TRUNCATE\s+TABLE|REPLACE\s+INTO|DELETE\s+FROM)\s+([^\s(;]+)
    """
)
COMMENT_TABLE_BACKTICK = re.compile(r"""(?i)^--\s+(?:Table structure for table|Dumping data for table)\s+`([^`]+)`""")
COMMENT_TABLE_QUOTE = re.compile(r"""(?i)^--\s+(?:Table structure for table|Dumping data for table)\s+"([^"]+)"""")
BANG_ALTER_KEYS = re.compile(r"""(?i)/\*![0-9]+\s+ALTER\s+TABLE\s+([^\s]+)\s+(?:DISABLE|ENABLE)\s+KEYS""")
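
# Lines these patterns are meant to catch, sketched with a hypothetical
# `wp_posts` table (grab_table() below returns the cleaned name "wp_posts"):
#   TABLE_TOK:              INSERT INTO `wp_posts` VALUES (...)
#   COMMENT_TABLE_BACKTICK: -- Table structure for table `wp_posts`
#   BANG_ALTER_KEYS:        /*!40000 ALTER TABLE `wp_posts` DISABLE KEYS */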

TYPE_MATCHERS = [
    (re.compile(r'(?i)^\s*INSERT\s+INTO\s'), "INSERT"),
    (re.compile(r'(?i)^\s*CREATE\s+TABLE\s'), "CREATE"),
    (re.compile(r'(?i)^\s*ALTER\s+TABLE\s'), "ALTER"),
    (re.compile(r'(?i)^\s*DROP\s+TABLE\s'), "DROP"),
    (re.compile(r'(?i)^\s*TRUNCATE\s+TABLE\s'), "TRUNCATE"),
    (re.compile(r'(?i)^\s*REPLACE\s+INTO\s'), "REPLACE"),
    (re.compile(r'(?i)^\s*DELETE\s+FROM\s'), "DELETE"),
    (re.compile(r'(?i)^\s*LOCK\s+TABLES'), "LOCK"),
]

QUOTE_CHARS = {'"', "'"}


def clean_table(token: str) -> str:
    t = token.strip().strip('`"')
    if '.' in t:
        t = t.split('.', 1)[1]
    return t.strip('`"')
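
# Examples (illustrative identifiers):
#   clean_table("`wp_posts`")        -> "wp_posts"
#   clean_table("`mydb`.`wp_posts`") -> "wp_posts"  (schema qualifier dropped)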


def grab_table(line: str) -> str:
    m = TABLE_TOK.search(line)
    if m: return clean_table(m.group(2))
    m = COMMENT_TABLE_BACKTICK.match(line)
    if m: return clean_table(m.group(1))
    m = COMMENT_TABLE_QUOTE.match(line)
    if m: return clean_table(m.group(1))
    m = BANG_ALTER_KEYS.search(line)
    if m: return clean_table(m.group(1))
    return ""


def prefix_of(tbl: str) -> str:
    m = re.match(r'^[A-Za-z0-9$]+_', tbl)
    return m.group(0) if m else "NO_PREFIX"
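
# Examples:
#   prefix_of("wp_posts") -> "wp_"
#   prefix_of("posts")    -> "NO_PREFIX"
# Note: multisite-style names only yield the first segment, since '_' is not
# in the character class: prefix_of("wp_2_posts") -> "wp_", not "wp_2_".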


def type_of(line: str) -> str:
    for rx, name in TYPE_MATCHERS:
        if rx.search(line): return name
    return "OTHER"


def statement_ends(line: str) -> bool:
    # Heuristic: a ';' at the end of a quoted value would also match, but
    # mysqldump terminates each statement's final line with ';'.
    return line.rstrip().endswith(';')


# ---- INSERT parsing ----
INSERT_HEAD_RE = re.compile(
    r'(?is)^\s*INSERT\s+INTO\s+(`?)(?P<table>[^\s(]+)\1\s*(?P<cols>\([^)]+\))?\s*VALUES\s*(?P<values>.+?)(?P<tail>\s+ON\s+DUPLICATE\s+KEY\s+UPDATE\s+.*)?;\s*$'
)
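
# What INSERT_HEAD_RE captures, sketched on a hypothetical statement:
#   INSERT INTO `wp_posts` (`ID`, `post_type`) VALUES (1,'post'),(2,'revision');
#   table  -> wp_posts (backticks stripped later by clean_table)
#   cols   -> (`ID`, `post_type`)
#   values -> (1,'post'),(2,'revision')
#   tail   -> optional "ON DUPLICATE KEY UPDATE ..." suffix, if present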


def split_values_tuples(values_blob: str):
    out, i, n = [], 0, len(values_blob)
    while i < n:
        while i < n and values_blob[i].isspace(): i += 1
        if i < n and values_blob[i] == ',': i += 1; continue
        if i >= n: break
        if values_blob[i] != '(': return [values_blob.strip()]
        depth = 0; start = i; in_str = False; esc = False; quote = ''
        while i < n:
            ch = values_blob[i]
            if in_str:
                if esc: esc = False
                elif ch == '\\': esc = True
                elif ch == quote: in_str = False
            else:
                if ch in QUOTE_CHARS: in_str = True; quote = ch
                elif ch == '(': depth += 1
                elif ch == ')':
                    depth -= 1
                    if depth == 0:
                        i += 1
                        out.append(values_blob[start:i].strip())
                        break
            i += 1
    return out
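
# Example: tuples split on top-level commas only; quoted commas are respected:
#   split_values_tuples("(1,'a,b'),(2,'c')") -> ["(1,'a,b')", "(2,'c')"]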


def parse_columns_list(cols: str):
    if not cols: return None
    inner = cols.strip()[1:-1]
    raw, cur, in_str, esc, quote = [], [], False, False, ''
    for ch in inner:
        if in_str:
            if esc: esc = False
            elif ch == '\\': esc = True
            elif ch == quote: in_str = False
            cur.append(ch)
        else:
            if ch in QUOTE_CHARS: in_str = True; quote = ch; cur.append(ch)
            elif ch == '`': cur.append(ch)
            elif ch == ',': raw.append(''.join(cur).strip().strip('`"').lower()); cur = []
            else: cur.append(ch)
    if cur: raw.append(''.join(cur).strip().strip('`"').lower())
    return [c for c in (t.strip() for t in raw) if c]
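
# Example: column names come back unquoted and lower-cased:
#   parse_columns_list("(`ID`, `post_type`)") -> ["id", "post_type"]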


def values_tuple_to_fields(tuple_text: str):
    assert tuple_text.startswith('(') and tuple_text.endswith(')')
    inner = tuple_text[1:-1]
    fields, cur = [], []
    in_str = False; esc = False; quote = ''; depth = 0
    for ch in inner:
        if in_str:
            cur.append(ch)
            if esc: esc = False
            elif ch == '\\': esc = True
            elif ch == quote: in_str = False
        else:
            if ch in QUOTE_CHARS: in_str = True; quote = ch; cur.append(ch)
            elif ch == '(': depth += 1; cur.append(ch)
            elif ch == ')': depth -= 1; cur.append(ch)
            elif ch == ',' and depth == 0:
                fields.append(''.join(cur).strip()); cur = []
            else: cur.append(ch)
    if cur: fields.append(''.join(cur).strip())
    return fields
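
# Example: fields keep their SQL quoting; embedded commas don't split:
#   values_tuple_to_fields("(1,'a,b',NULL)") -> ["1", "'a,b'", "NULL"]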


def sql_lit_equals(sql_literal: str, target: str) -> bool:
    sl = sql_literal.strip()
    if len(sl) >= 2 and sl[0] == sl[-1] and sl[0] in QUOTE_CHARS:
        return sl[1:-1] == target
    return False


def sql_lit_startswith(sql_literal: str, prefix: str) -> bool:
    sl = sql_literal.strip()
    if len(sl) >= 2 and sl[0] == sl[-1] and sl[0] in QUOTE_CHARS:
        return sl[1:-1].startswith(prefix)
    return False


def strip_quotes(s: str) -> str:
    s = s.strip()
    if len(s) >= 2 and s[0] == s[-1] and s[0] in QUOTE_CHARS:
        return s[1:-1]
    return s
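
# Examples (illustrative literals):
#   sql_lit_equals("'revision'", "revision")        -> True
#   sql_lit_startswith("'_oembed_abc'", "_oembed_") -> True
#   strip_quotes("'123'")                           -> "123"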


def reconstruct_insert(table: str, cols: str, tuple_chunks, tail: str) -> str:
    head = f"INSERT INTO `{table}`"
    if cols: head += f" {cols}"
    head += " VALUES "
    body = ",".join(tuple_chunks)
    return f"{head}{body}{tail or ''};\n"


# ---- Modes ----
def mode_report(infile, keep_prefix: str, path_display: str, drop_patterns, verbose=False, progress_mb=0):
    tabs_by_pfx = defaultdict(int)
    total_tabs = 0
    stmt_counts = defaultdict(int)
    samples = defaultdict(list)
    seen_tables = set()
    base_seen = defaultdict(lambda: defaultdict(bool))
    base_pfx_count = defaultdict(int)
    bytes_read = 0
    next_mark = progress_mb * 1024 * 1024 if progress_mb else 0

    def register_table(t: str, line: str):
        nonlocal total_tabs
        pfx = prefix_of(t)
        key = (pfx, t)
        if key not in seen_tables:
            seen_tables.add(key)
            tabs_by_pfx[pfx] += 1
            total_tabs += 1
            if len(samples[pfx]) < 15: samples[pfx].append(t)
            m = re.match(r'^[^_]+_(.*)$', t)
            if m:
                base = m.group(1)
                if not base_seen[base][pfx]:
                    base_seen[base][pfx] = True
                    base_pfx_count[base] += 1
        st = type_of(line)
        stmt_counts[(pfx, st)] += 1

    for line in infile:
        bytes_read += len(line)
        if next_mark and bytes_read >= next_mark:
            print(f"[report] {bytes_read//(1024*1024)} MB...", file=sys.stderr)
            next_mark += progress_mb * 1024 * 1024
        t = grab_table(line)
        if not t: continue
        # apply drop-like patterns
        if any(fnmatch.fnmatch(t, pat) for pat in drop_patterns): continue
        register_table(t, line)

    prefixes = sorted(tabs_by_pfx.keys())
    kept_tabs = sum(tabs_by_pfx[p] for p in prefixes if p == keep_prefix)
    drop_tabs = sum(tabs_by_pfx[p] for p in prefixes if p != keep_prefix)
    total_kd = kept_tabs + drop_tabs
    kpct = (100.0 * kept_tabs / total_kd) if total_kd else 0.0
    dpct = (100.0 * drop_tabs / total_kd) if total_kd else 0.0

    print("=== SQL DUMP REPORT (DRY RUN) ===")
    print(f"Input file: {path_display}")
    print(f"Chosen KEEP prefix: {keep_prefix}")
    if drop_patterns:
        print(f"Extra drop-like patterns: {', '.join(drop_patterns)}")
    print(f"Distinct prefixes found: {len(prefixes)}")
    print(f"Distinct tables referenced: {total_tabs}\n")
    print(f"KEEP tables: {kept_tabs} ({kpct:.2f}%)")
    print(f"DROP tables: {drop_tabs} ({dpct:.2f}%)\n")

    typelist = ["CREATE","INSERT","ALTER","DROP","TRUNCATE","REPLACE","DELETE","LOCK","OTHER"]
    for p in prefixes:
        print(f"-- [{'KEEP' if p==keep_prefix else 'DROP'}] {p} --")
        print(f"   Unique tables: {tabs_by_pfx[p]}")
        for ty in typelist:
            c = stmt_counts[(p, ty)]
            if c: print(f"   {ty:<8}: {c}")
        if samples[p]:
            print("   Sample tables (up to 15):")
            for s in samples[p]: print(f"     - {s}")
            if tabs_by_pfx[p] > len(samples[p]):
                print(f"     ... (+{tabs_by_pfx[p]-len(samples[p])} more)")
        print()

    collisions = [b for b, n in base_pfx_count.items() if n > 1]
    if collisions:
        print(f"Cross-prefix collisions (same base table name across prefixes): {len(collisions)}")
        shown = 0
        for b in collisions:
            pf = [p for p in prefixes if base_seen[b][p]]
            print(f"  - {b} (prefixes: {', '.join(pf)})")
            shown += 1
            if shown >= 15:
                print("  ... (+more)")
                break
        print()
    print("=== END REPORT ===")


def mode_list(infile, keep_prefix: str, drop_patterns, progress_mb=0):
    tabs_by_pfx = defaultdict(list)
    seen = set()
    bytes_read = 0
    next_mark = progress_mb * 1024 * 1024 if progress_mb else 0
    for line in infile:
        bytes_read += len(line)
        if next_mark and bytes_read >= next_mark:
            print(f"[list] {bytes_read//(1024*1024)} MB...", file=sys.stderr)
            next_mark += progress_mb * 1024 * 1024
        t = grab_table(line)
        if not t: continue
        if any(fnmatch.fnmatch(t, pat) for pat in drop_patterns): continue
        pfx = prefix_of(t)
        key = (pfx, t)
        if key not in seen:
            seen.add(key)
            tabs_by_pfx[pfx].append(t)

    print("== Tables found ==")
    for p in sorted(tabs_by_pfx.keys()):
        head = f"-- Will KEEP ({keep_prefix}) --" if p == keep_prefix else f"-- Will DROP (prefix {p}) --"
        print(head)
        for t in tabs_by_pfx[p]: print(t)
        print()


# ---- Filter with WP junk stripping ----
def filter_and_write(infile, outfile, keep_prefix: str,
                     strip_revisions: bool, strip_autodrafts: bool,
                     strip_trashed: bool, strip_oembed: bool,
                     wp_prefix: str, drop_patterns, verbose=False,
                     progress_mb=0):
    removed_post_ids = set()
    posts_table = f"{wp_prefix}posts"
    postmeta_table = f"{wp_prefix}postmeta"
    options_table = f"{wp_prefix}options"

    def table_is_dropped_by_pattern(tbl: str) -> bool:
        return any(fnmatch.fnmatch(tbl, pat) for pat in drop_patterns)

    def should_keep_table(tbl: str) -> bool:
        return tbl.startswith(keep_prefix) and not table_is_dropped_by_pattern(tbl)

    def process_insert_line(line: str) -> str:
        m = INSERT_HEAD_RE.match(line)
        if not m: return line
        table = clean_table(m.group('table'))
        cols = m.group('cols')
        vals = m.group('values')
        tail = m.group('tail') or ""

        if table not in (posts_table, postmeta_table, options_table):
            return line

        tuples = split_values_tuples(vals)
        if not tuples: return line
        col_list = parse_columns_list(cols)
        kept = []

        if table == posts_table:
            try:
                if col_list:
                    id_idx = col_list.index("id")
                    pt_idx = col_list.index("post_type")
                    ps_idx = col_list.index("post_status")
                else:
                    # No column list: fall back to ID in position 0 and
                    # scanning every field for the marker literals.
                    id_idx = 0
                    pt_idx = None
                    ps_idx = None
                dropped_here = 0
                for tup in tuples:
                    f = values_tuple_to_fields(tup)
                    if not f: continue
                    is_rev = is_ad = is_tr = False
                    if strip_revisions:
                        if pt_idx is not None and pt_idx < len(f):
                            is_rev = sql_lit_equals(f[pt_idx], "revision")
                        else:
                            is_rev = any(sql_lit_equals(x, "revision") for x in f)
                    if strip_autodrafts or strip_trashed:
                        if ps_idx is not None and ps_idx < len(f):
                            if strip_autodrafts: is_ad = sql_lit_equals(f[ps_idx], "auto-draft")
                            if strip_trashed: is_tr = sql_lit_equals(f[ps_idx], "trash")
                        else:
                            if strip_autodrafts: is_ad = any(sql_lit_equals(x, "auto-draft") for x in f)
                            if strip_trashed: is_tr = any(sql_lit_equals(x, "trash") for x in f)
                    drop = is_rev or is_ad or is_tr
                    if drop:
                        if id_idx < len(f):
                            removed_post_ids.add(strip_quotes(f[id_idx]))
                        dropped_here += 1
                        continue
                    kept.append(tup)
                if verbose and dropped_here:
                    print(f"[posts] dropped {dropped_here} rows (rev/auto-draft/trash)", file=sys.stderr)
            except ValueError:
                return line

        elif table == postmeta_table:
            try:
                if col_list:
                    pid_idx = col_list.index("post_id")
                    mk_idx = col_list.index("meta_key") if strip_oembed else None
                else:
                    pid_idx = 1
                    mk_idx = 2 if strip_oembed else None
                dropped_here = 0
                for tup in tuples:
                    f = values_tuple_to_fields(tup)
                    if not f: continue
                    post_id = strip_quotes(f[pid_idx]) if pid_idx < len(f) else None
                    if post_id and post_id in removed_post_ids:
                        dropped_here += 1
                        continue
                    if strip_oembed and mk_idx is not None and mk_idx < len(f):
                        mk = f[mk_idx]
                        if sql_lit_startswith(mk, "_oembed_") or sql_lit_startswith(mk, "_oembed_time_"):
                            dropped_here += 1
                            continue
                    kept.append(tup)
                if verbose and dropped_here:
                    print(f"[postmeta] dropped {dropped_here} rows (cascade/oEmbed)", file=sys.stderr)
            except ValueError:
                return line

        elif table == options_table and strip_oembed:
            try:
                if col_list:
                    on_idx = col_list.index("option_name")
                else:
                    on_idx = 1
                dropped_here = 0
                for tup in tuples:
                    f = values_tuple_to_fields(tup)
                    if not f: continue
                    opt_name = f[on_idx] if on_idx < len(f) else ""
                    if (sql_lit_startswith(opt_name, "_transient_oembed_") or
                            sql_lit_startswith(opt_name, "_transient_timeout_oembed_")):
                        dropped_here += 1
                        continue
                    kept.append(tup)
                if verbose and dropped_here:
                    print(f"[options] dropped {dropped_here} rows (oEmbed transients)", file=sys.stderr)
            except ValueError:
                return line

        else:
            # options table with --strip-oembed off: nothing to filter here.
            # Without this branch, `kept` would stay empty and the whole
            # statement would be deleted below.
            return line

        if not kept:
            return ""
        return reconstruct_insert(table, cols if cols else "", kept, tail)

    bytes_read = 0
    next_mark = progress_mb * 1024 * 1024 if progress_mb else 0
    buffer = []
    in_block = False
    keep_block = False
    current_table = ""

    for raw in infile:
        line = raw
        bytes_read += len(line)  # character count; close enough for progress
        if next_mark and bytes_read >= next_mark:
            print(f"[filter] {bytes_read//(1024*1024)} MB...", file=sys.stderr)
            next_mark += progress_mb * 1024 * 1024

        t = grab_table(line)
        semicolon = statement_ends(line)
        if t and not in_block:
            in_block = True
            current_table = t
            keep_block = should_keep_table(t)
            if verbose and t.startswith(keep_prefix) and not keep_block:
                print(f"[drop-like] excluding table: {t}", file=sys.stderr)

        if keep_block:
            if line.lstrip().lower().startswith("insert into") or buffer:
                # Start of an INSERT, or the continuation of a multi-line one:
                # buffer until the terminating ';' so the whole statement can
                # be row-filtered at once.
                buffer.append(line)
                if semicolon:
                    full = ''.join(buffer); buffer.clear()
                    mod = process_insert_line(full)
                    if mod: outfile.write(mod)
            else:
                outfile.write(line)
        else:
            # Dropped table: still track multi-line INSERTs so we know where
            # the statement ends, but write nothing.
            if buffer:
                buffer.append(line)
                if semicolon: buffer.clear()
            elif line.lstrip().lower().startswith("insert into"):
                buffer.append(line)
                if semicolon: buffer.clear()

        if in_block and semicolon:
            in_block = False
            keep_block = False
            current_table = ""

    # Flush a trailing kept INSERT that never saw its terminating ';'
    # (e.g. a truncated dump); it passes through unchanged if unparseable.
    if buffer and keep_block:
        mod = process_insert_line(''.join(buffer))
        if mod: outfile.write(mod)


# ---- CLI ----
def main():
    p = argparse.ArgumentParser(
        description="Filter MySQL dump by table prefix; list/report; strip WP junk; progress; drop-like patterns.",
        formatter_class=argparse.RawTextHelpFormatter
    )
    p.add_argument("-i", dest="infile", default=os.path.expanduser("~/dump.sql"),
                   help="Input dump (.sql or .sql.gz). Default: ~/dump.sql")
    p.add_argument("-o", dest="outfile", default=os.path.expanduser("~/dump.filtered.sql"),
                   help="Output file for filtered mode (.sql or .sql.gz). Default: ~/dump.filtered.sql")
    p.add_argument("-p", dest="prefix", default="7CHIc_", help="Table prefix to KEEP. Default: 7CHIc_")
    p.add_argument("-v", "--verbose", action="store_true", help="Verbose logs to stderr.")
    p.add_argument("--progress-every-mb", type=int, default=0, help="Print progress every N MB read.")

    p.add_argument("--list", dest="mode", action="store_const", const="list", help="List tables grouped by prefix.")
    p.add_argument("--report", dest="mode", action="store_const", const="report", help="Detailed dry-run report.")

    # WP cleanup toggles
    p.add_argument("--strip-revisions", action="store_true", help="Remove wp_posts revisions + related postmeta.")
    p.add_argument("--strip-auto-drafts", action="store_true", help="Remove wp_posts auto-drafts + related postmeta.")
    p.add_argument("--strip-trashed", action="store_true", help="Remove wp_posts with post_status='trash' + postmeta.")
    p.add_argument("--strip-oembed", action="store_true", help="Remove oEmbed caches (postmeta + options transients).")
    p.add_argument("--strip-wp-junk", action="store_true",
                   help="Enable all: --strip-revisions --strip-auto-drafts --strip-trashed --strip-oembed")
    p.add_argument("--wp-prefix", dest="wp_prefix", default=None,
                   help="WordPress table prefix for cleanup (defaults to -p).")

    # Extra: drop-like table patterns (repeatable)
    p.add_argument("--drop-like", action="append", default=[], metavar="PATTERN",
                   help="Glob pattern to exclude tables (e.g. '*_aioseo_cache'). Repeatable.")

    args = p.parse_args()
    mode = args.mode or "filter"

    infile_path = os.path.expanduser(args.infile)
    outfile_path = os.path.expanduser(args.outfile)
    keep_prefix = args.prefix
    wp_prefix = args.wp_prefix or keep_prefix

    if not os.path.isfile(infile_path):
        print(f"Input not found: {infile_path}", file=sys.stderr)
        sys.exit(1)

    # Expand shorthand
    strip_revisions = args.strip_revisions or args.strip_wp_junk
    strip_autodrafts = args.strip_auto_drafts or args.strip_wp_junk
    strip_trashed = args.strip_trashed or args.strip_wp_junk
    strip_oembed = args.strip_oembed or args.strip_wp_junk

    drop_patterns = args.drop_like or []

    with open_input(infile_path) as fin:
        if mode == "report":
            mode_report(fin, keep_prefix, infile_path, drop_patterns, verbose=args.verbose, progress_mb=args.progress_every_mb)
        elif mode == "list":
            mode_list(fin, keep_prefix, drop_patterns, progress_mb=args.progress_every_mb)
        else:
            with open_output(outfile_path) as fout:
                filter_and_write(fin, fout, keep_prefix,
                                 strip_revisions, strip_autodrafts,
                                 strip_trashed, strip_oembed,
                                 wp_prefix, drop_patterns,
                                 verbose=args.verbose, progress_mb=args.progress_every_mb)
            print(f"✅ Done. Wrote filtered dump to: {outfile_path}")


if __name__ == "__main__":
    main()
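
# Illustrative invocations (hypothetical filename filter_dump.py):
#   ./filter_dump.py -i ~/dump.sql.gz -p wp_ --report
#   ./filter_dump.py -i ~/dump.sql -o ~/dump.filtered.sql.gz -p wp_ \
#       --strip-wp-junk --drop-like '*_aioseo_cache' --progress-every-mb 256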