Created
March 12, 2026 04:21
-
-
Save robertdevore/86064e9e84fd8d39619442e45f4102cd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Split a .sql dump into chunks WITHOUT breaking SQL statements. | |
| - Splits only on semicolons that end a statement (outside strings/comments) | |
| - Keeps each chunk <= max_bytes (approx; never splits a statement) | |
| - Handles: single/double quotes, backticks, escaped quotes, --/# comments, /* */ comments | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import sys | |
| def split_sql_statements(input_path: str, output_dir: str, max_bytes: int) -> None: | |
| os.makedirs(output_dir, exist_ok=True) | |
| def out_path(i: int) -> str: | |
| return os.path.join(output_dir, f"chunk_{i:04d}.sql") | |
| chunk_i = 1 | |
| out_f = open(out_path(chunk_i), "wb") | |
| out_size = 0 | |
| stmt = bytearray() | |
| in_sq = False # ' | |
| in_dq = False # " | |
| in_bt = False # ` | |
| in_ml_comment = False # /* ... */ | |
| in_sl_comment = False # -- ... or # ... | |
| prev = None | |
| i = 0 | |
| data = open(input_path, "rb").read() | |
| n = len(data) | |
| def flush_statement(): | |
| nonlocal chunk_i, out_f, out_size, stmt | |
| if not stmt: | |
| return | |
| # If statement itself is bigger than max_bytes, we still have to write it whole. | |
| if out_size and (out_size + len(stmt)) > max_bytes: | |
| out_f.close() | |
| chunk_i += 1 | |
| out_f = open(out_path(chunk_i), "wb") | |
| out_size = 0 | |
| out_f.write(stmt) | |
| out_size += len(stmt) | |
| stmt = bytearray() | |
| while i < n: | |
| c = data[i] | |
| stmt.append(c) | |
| # Track start/end of single-line comments | |
| if not in_sq and not in_dq and not in_bt and not in_ml_comment: | |
| # Start of -- comment (must be followed by space, tab, or end-ish in many dumps; we'll treat "-- " and "--\t" as comment) | |
| if not in_sl_comment and c == ord('-') and prev == ord('-'): | |
| nxt = data[i+1] if i + 1 < n else None | |
| if nxt in (ord(' '), ord('\t')): | |
| in_sl_comment = True | |
| # Start of # comment | |
| if not in_sl_comment and c == ord('#'): | |
| in_sl_comment = True | |
| # End of single-line comment on newline | |
| if in_sl_comment and c == ord('\n'): | |
| in_sl_comment = False | |
| # Track start/end of multi-line comments | |
| if not in_sq and not in_dq and not in_bt and not in_sl_comment: | |
| # Start: /* | |
| if not in_ml_comment and prev == ord('/') and c == ord('*'): | |
| in_ml_comment = True | |
| # End: */ | |
| elif in_ml_comment and prev == ord('*') and c == ord('/'): | |
| in_ml_comment = False | |
| # Track quotes (ignore when inside comments) | |
| if not in_sl_comment and not in_ml_comment: | |
| # handle escapes inside strings | |
| if c == ord("'") and not in_dq and not in_bt: | |
| # toggle only if not escaped | |
| if prev != ord('\\'): | |
| in_sq = not in_sq | |
| elif c == ord('"') and not in_sq and not in_bt: | |
| if prev != ord('\\'): | |
| in_dq = not in_dq | |
| elif c == ord('`') and not in_sq and not in_dq: | |
| in_bt = not in_bt | |
| # Detect statement end: semicolon outside quotes/comments | |
| if (c == ord(';') | |
| and not in_sq and not in_dq and not in_bt | |
| and not in_sl_comment and not in_ml_comment): | |
| flush_statement() | |
| prev = c | |
| i += 1 | |
| # flush any trailing bytes | |
| flush_statement() | |
| if stmt: | |
| # leftover (e.g., dump without ending semicolon) — write it | |
| if out_size and (out_size + len(stmt)) > max_bytes: | |
| out_f.close() | |
| chunk_i += 1 | |
| out_f = open(out_path(chunk_i), "wb") | |
| out_f.write(stmt) | |
| out_f.close() | |
| print(f"Done. Wrote {chunk_i} chunk(s) to: {output_dir}") | |
| if __name__ == "__main__": | |
| if len(sys.argv) < 3: | |
| print("Usage: split_sql_statements.py input.sql output_dir [max_bytes]") | |
| print("Example: split_sql_statements.py backup.sql sql_chunks 900000") | |
| sys.exit(1) | |
| input_path = sys.argv[1] | |
| output_dir = sys.argv[2] | |
| max_bytes = int(sys.argv[3]) if len(sys.argv) > 3 else 900_000 | |
| split_sql_statements(input_path, output_dir, max_bytes) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment