robertdevore · March 12, 2026 04:21
diff --git a/sqlSplit.py b/sqlSplit.py
 #!/usr/bin/env python3
 """
 Split a .sql dump into chunks WITHOUT breaking SQL statements.

 - Splits only on semicolons that end a statement (outside strings/comments)
 - Keeps each chunk <= max_bytes (approx; never splits a statement)
 - Handles: single/double quotes, backticks, escaped quotes, --/# comments, /* */ comments
 """

 from __future__ import annotations
 import os
 import sys

 def split_sql_statements(input_path: str, output_dir: str, max_bytes: int) -> None:
    os.makedirs(output_dir, exist_ok=True)

    def out_path(i: int) -> str:
        return os.path.join(output_dir, f"chunk_{i:04d}.sql")

    chunk_i = 1
    out_f = open(out_path(chunk_i), "wb")
    out_size = 0

    stmt = bytearray()

    in_sq = False       # '
    in_dq = False       # "
    in_bt = False       # `
    in_ml_comment = False  # /* ... */
    in_sl_comment = False  # -- ... or # ...
    prev = None
    i = 0

    data = open(input_path, "rb").read()
    n = len(data)

    def flush_statement():
        nonlocal chunk_i, out_f, out_size, stmt

        if not stmt:
            return

        # If statement itself is bigger than max_bytes, we still have to write it whole.
        if out_size and (out_size + len(stmt)) > max_bytes:
            out_f.close()
            chunk_i += 1
            out_f = open(out_path(chunk_i), "wb")
            out_size = 0

        out_f.write(stmt)
        out_size += len(stmt)
        stmt = bytearray()

    while i < n:
        c = data[i]
        stmt.append(c)

        # Track start/end of single-line comments
        if not in_sq and not in_dq and not in_bt and not in_ml_comment:
            # Start of -- comment (must be followed by space, tab, or end-ish in many dumps; we'll treat "-- " and "--\t" as comment)
            if not in_sl_comment and c == ord('-') and prev == ord('-'):
                nxt = data[i+1] if i + 1 < n else None
                if nxt in (ord(' '), ord('\t')):
                    in_sl_comment = True
            # Start of # comment
            if not in_sl_comment and c == ord('#'):
                in_sl_comment = True

        # End of single-line comment on newline
        if in_sl_comment and c == ord('\n'):
            in_sl_comment = False

        # Track start/end of multi-line comments
        if not in_sq and not in_dq and not in_bt and not in_sl_comment:
            # Start: /*
            if not in_ml_comment and prev == ord('/') and c == ord('*'):
                in_ml_comment = True
            # End: */
            elif in_ml_comment and prev == ord('*') and c == ord('/'):
                in_ml_comment = False

        # Track quotes (ignore when inside comments)
        if not in_sl_comment and not in_ml_comment:
            # handle escapes inside strings
            if c == ord("'") and not in_dq and not in_bt:
                # toggle only if not escaped
                if prev != ord('\\'):
                    in_sq = not in_sq
            elif c == ord('"') and not in_sq and not in_bt:
                if prev != ord('\\'):
                    in_dq = not in_dq
            elif c == ord('`') and not in_sq and not in_dq:
                in_bt = not in_bt

        # Detect statement end: semicolon outside quotes/comments
        if (c == ord(';')
            and not in_sq and not in_dq and not in_bt
            and not in_sl_comment and not in_ml_comment):
            flush_statement()

        prev = c
        i += 1

    # flush any trailing bytes
    flush_statement()
    if stmt:
        # leftover (e.g., dump without ending semicolon) — write it
        if out_size and (out_size + len(stmt)) > max_bytes:
            out_f.close()
            chunk_i += 1
            out_f = open(out_path(chunk_i), "wb")
        out_f.write(stmt)

    out_f.close()
    print(f"Done. Wrote {chunk_i} chunk(s) to: {output_dir}")

 if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: split_sql_statements.py input.sql output_dir [max_bytes]")
        print("Example: split_sql_statements.py backup.sql sql_chunks 900000")
        sys.exit(1)

    input_path = sys.argv[1]
    output_dir = sys.argv[2]
    max_bytes = int(sys.argv[3]) if len(sys.argv) > 3 else 900_000

    split_sql_statements(input_path, output_dir, max_bytes)
	#!/usr/bin/env python3
	"""
	Split a .sql dump into chunks WITHOUT breaking SQL statements.

	- Splits only on semicolons that end a statement (outside strings/comments)
	- Keeps each chunk <= max_bytes (approx; never splits a statement)
	- Handles: single/double quotes, backticks, escaped quotes, --/# comments, /* */ comments
	"""

	from __future__ import annotations
	import os
	import sys

	def split_sql_statements(input_path: str, output_dir: str, max_bytes: int) -> None:
	os.makedirs(output_dir, exist_ok=True)

	def out_path(i: int) -> str:
	return os.path.join(output_dir, f"chunk_{i:04d}.sql")

	chunk_i = 1
	out_f = open(out_path(chunk_i), "wb")
	out_size = 0

	stmt = bytearray()

	in_sq = False # '
	in_dq = False # "
	in_bt = False # `
	in_ml_comment = False # /* ... */
	in_sl_comment = False # -- ... or # ...
	prev = None
	i = 0

	data = open(input_path, "rb").read()
	n = len(data)

	def flush_statement():
	nonlocal chunk_i, out_f, out_size, stmt

	if not stmt:
	return

	# If statement itself is bigger than max_bytes, we still have to write it whole.
	if out_size and (out_size + len(stmt)) > max_bytes:
	out_f.close()
	chunk_i += 1
	out_f = open(out_path(chunk_i), "wb")
	out_size = 0

	out_f.write(stmt)
	out_size += len(stmt)
	stmt = bytearray()

	while i < n:
	c = data[i]
	stmt.append(c)

	# Track start/end of single-line comments
	if not in_sq and not in_dq and not in_bt and not in_ml_comment:
	# Start of -- comment (must be followed by space, tab, or end-ish in many dumps; we'll treat "-- " and "--\t" as comment)
	if not in_sl_comment and c == ord('-') and prev == ord('-'):
	nxt = data[i+1] if i + 1 < n else None
	if nxt in (ord(' '), ord('\t')):
	in_sl_comment = True
	# Start of # comment
	if not in_sl_comment and c == ord('#'):
	in_sl_comment = True

	# End of single-line comment on newline
	if in_sl_comment and c == ord('\n'):
	in_sl_comment = False

	# Track start/end of multi-line comments
	if not in_sq and not in_dq and not in_bt and not in_sl_comment:
	# Start: /*
	if not in_ml_comment and prev == ord('/') and c == ord('*'):
	in_ml_comment = True
	# End: */
	elif in_ml_comment and prev == ord('*') and c == ord('/'):
	in_ml_comment = False

	# Track quotes (ignore when inside comments)
	if not in_sl_comment and not in_ml_comment:
	# handle escapes inside strings
	if c == ord("'") and not in_dq and not in_bt:
	# toggle only if not escaped
	if prev != ord('\\'):
	in_sq = not in_sq
	elif c == ord('"') and not in_sq and not in_bt:
	if prev != ord('\\'):
	in_dq = not in_dq
	elif c == ord('`') and not in_sq and not in_dq:
	in_bt = not in_bt

	# Detect statement end: semicolon outside quotes/comments
	if (c == ord(';')
	and not in_sq and not in_dq and not in_bt
	and not in_sl_comment and not in_ml_comment):
	flush_statement()

	prev = c
	i += 1

	# flush any trailing bytes
	flush_statement()
	if stmt:
	# leftover (e.g., dump without ending semicolon) — write it
	if out_size and (out_size + len(stmt)) > max_bytes:
	out_f.close()
	chunk_i += 1
	out_f = open(out_path(chunk_i), "wb")
	out_f.write(stmt)

	out_f.close()
	print(f"Done. Wrote {chunk_i} chunk(s) to: {output_dir}")

	if __name__ == "__main__":
	if len(sys.argv) < 3:
	print("Usage: split_sql_statements.py input.sql output_dir [max_bytes]")
	print("Example: split_sql_statements.py backup.sql sql_chunks 900000")
	sys.exit(1)

	input_path = sys.argv[1]
	output_dir = sys.argv[2]
	max_bytes = int(sys.argv[3]) if len(sys.argv) > 3 else 900_000

	split_sql_statements(input_path, output_dir, max_bytes)
No results found