Skip to content

Instantly share code, notes, and snippets.

@robertdevore
Created March 12, 2026 04:21
Show Gist options
  • Select an option

  • Save robertdevore/86064e9e84fd8d39619442e45f4102cd to your computer and use it in GitHub Desktop.

Select an option

Save robertdevore/86064e9e84fd8d39619442e45f4102cd to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Split a .sql dump into chunks WITHOUT breaking SQL statements.
- Splits only on semicolons that end a statement (outside strings/comments)
- Keeps each chunk <= max_bytes (approx; never splits a statement)
- Handles: single/double quotes, backticks, escaped quotes, --/# comments, /* */ comments
"""
from __future__ import annotations
import os
import sys
def split_sql_statements(input_path: str, output_dir: str, max_bytes: int) -> None:
os.makedirs(output_dir, exist_ok=True)
def out_path(i: int) -> str:
return os.path.join(output_dir, f"chunk_{i:04d}.sql")
chunk_i = 1
out_f = open(out_path(chunk_i), "wb")
out_size = 0
stmt = bytearray()
in_sq = False # '
in_dq = False # "
in_bt = False # `
in_ml_comment = False # /* ... */
in_sl_comment = False # -- ... or # ...
prev = None
i = 0
data = open(input_path, "rb").read()
n = len(data)
def flush_statement():
nonlocal chunk_i, out_f, out_size, stmt
if not stmt:
return
# If statement itself is bigger than max_bytes, we still have to write it whole.
if out_size and (out_size + len(stmt)) > max_bytes:
out_f.close()
chunk_i += 1
out_f = open(out_path(chunk_i), "wb")
out_size = 0
out_f.write(stmt)
out_size += len(stmt)
stmt = bytearray()
while i < n:
c = data[i]
stmt.append(c)
# Track start/end of single-line comments
if not in_sq and not in_dq and not in_bt and not in_ml_comment:
# Start of -- comment (must be followed by space, tab, or end-ish in many dumps; we'll treat "-- " and "--\t" as comment)
if not in_sl_comment and c == ord('-') and prev == ord('-'):
nxt = data[i+1] if i + 1 < n else None
if nxt in (ord(' '), ord('\t')):
in_sl_comment = True
# Start of # comment
if not in_sl_comment and c == ord('#'):
in_sl_comment = True
# End of single-line comment on newline
if in_sl_comment and c == ord('\n'):
in_sl_comment = False
# Track start/end of multi-line comments
if not in_sq and not in_dq and not in_bt and not in_sl_comment:
# Start: /*
if not in_ml_comment and prev == ord('/') and c == ord('*'):
in_ml_comment = True
# End: */
elif in_ml_comment and prev == ord('*') and c == ord('/'):
in_ml_comment = False
# Track quotes (ignore when inside comments)
if not in_sl_comment and not in_ml_comment:
# handle escapes inside strings
if c == ord("'") and not in_dq and not in_bt:
# toggle only if not escaped
if prev != ord('\\'):
in_sq = not in_sq
elif c == ord('"') and not in_sq and not in_bt:
if prev != ord('\\'):
in_dq = not in_dq
elif c == ord('`') and not in_sq and not in_dq:
in_bt = not in_bt
# Detect statement end: semicolon outside quotes/comments
if (c == ord(';')
and not in_sq and not in_dq and not in_bt
and not in_sl_comment and not in_ml_comment):
flush_statement()
prev = c
i += 1
# flush any trailing bytes
flush_statement()
if stmt:
# leftover (e.g., dump without ending semicolon) — write it
if out_size and (out_size + len(stmt)) > max_bytes:
out_f.close()
chunk_i += 1
out_f = open(out_path(chunk_i), "wb")
out_f.write(stmt)
out_f.close()
print(f"Done. Wrote {chunk_i} chunk(s) to: {output_dir}")
if __name__ == "__main__":
if len(sys.argv) < 3:
print("Usage: split_sql_statements.py input.sql output_dir [max_bytes]")
print("Example: split_sql_statements.py backup.sql sql_chunks 900000")
sys.exit(1)
input_path = sys.argv[1]
output_dir = sys.argv[2]
max_bytes = int(sys.argv[3]) if len(sys.argv) > 3 else 900_000
split_sql_statements(input_path, output_dir, max_bytes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment