Created
November 4, 2025 05:30
-
-
Save BjornFJohansson/31188a11a7f365ca5293e0e37dfb6347 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Clean underscored aliases while preserving original line breaks/spacing. | |
| Rules | |
| ----- | |
| 1) from X import Name as _Name -> from X import Name (and _Name → Name) | |
| 2) import pkg as _pkg -> import pkg (and _pkg → pkg) | |
| 3) import networkx as _nx -> import networkx as nx (and _nx → nx) | |
| Outputs: <original>_clean.py | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| import re | |
| import sys | |
| from typing import Dict, List, Tuple | |
| # Types | |
| Plan = Dict[str, str] # maps an underscored import alias to the name that replaces it | |
| FromBlock = Tuple[str, str, str, str] # (full_match, module, imported-names text, suffix) | |
| # --- helpers that DO NOT touch newlines --- | |
def _normalize_dots_segment(s: str) -> str:
    """Collapse horizontal whitespace around every '.' in *s*.

    Only spaces and tabs that touch a dot are removed; newlines and all
    other characters are returned unchanged.
    """
    padded_dot = re.compile(r"[ \t]*\.[ \t]*")
    return padded_dot.sub(".", s)
def _split_top_level_commas(s: str) -> List[str]:
    """Split *s* on commas that are not nested inside parentheses.

    Each piece is stripped of surrounding whitespace and empty pieces are
    dropped. An unmatched ')' never drives the nesting level below zero.
    """
    pieces: List[str] = []
    nesting = 0
    start = 0
    for pos, char in enumerate(s):
        if char == "(":
            nesting += 1
        elif char == ")":
            nesting = max(nesting - 1, 0)
        elif char == "," and nesting == 0:
            # Top-level separator: cut the piece that ends just before it.
            pieces.append(s[start:pos])
            start = pos + 1
    pieces.append(s[start:])
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]
| # --- collect plan --- | |
# A "from <mod> import <stuff>" statement.  <stuff> is either
#   * a parenthesised name list, which may span several lines, or
#   * the rest of the logical line, following backslash continuations.
# The match ends with the statement itself, so indented code that follows an
# import inside a function/class body is never swallowed into it, and a closing
# ")" at column 0 is still part of the statement.
# A parenthesised list that contains a comment is deliberately NOT captured as a
# whole (only its first line is): re-joining such a list could let the comment
# swallow later names, so those statements are left for manual cleanup.
# Groups: 1 = whole statement (same text as group 0), "mod", "stuff".
FROM_RE = re.compile(
    r"(^[ \t]*from[ \t]+(?P<mod>[^\s]+)[ \t]+import[ \t]+"
    r"(?P<stuff>\([^()#]*\)|(?:\\\n|[^\n])*))",
    re.MULTILINE,
)

# A plain "import <tail>" statement: everything up to the end of the logical
# line, following backslash continuations only.
# Groups: 1 = whole statement (same text as group 0), "tail".
IMPORT_RE = re.compile(
    r"(^[ \t]*import[ \t]+(?P<tail>(?:\\\n|[^\n])*))",
    re.MULTILINE,
)
def _extract_from_blocks(code: str) -> List[FromBlock]:
    """List every ``from ... import ...`` statement that FROM_RE finds in *code*.

    Each entry is ``(full_match, module, stuff, suffix)``.

    NOTE(review): ``suffix`` is always the empty string. It is the slice of
    *code* between the end of the match and a boundary that is set to that
    same position. Confirm whether a real suffix was ever intended.
    """
    return [
        (found.group(1), found.group("mod"), found.group("stuff"), "")
        for found in FROM_RE.finditer(code)
    ]
def _planned_name(alias: str, base: str) -> str | None:
    """Return the name that should replace *alias*, or None to leave it alone.

    *base* is the imported name (for ``from`` imports) or the last dotted
    component of the module path (for plain imports).
    """
    import keyword  # local import: the file header does not import it

    if alias == f"_{base}":
        candidate = base
    elif alias.startswith("_"):
        candidate = alias.lstrip("_")
    else:
        return None
    # "_" / "__" strip to "", "_1x" to "1x", "_class" to "class". None of these
    # is usable as a name, and an empty replacement would make the usage pass
    # delete every occurrence of the alias (e.g. the common ``gettext as _``).
    if not candidate.isidentifier() or keyword.iskeyword(candidate):
        return None
    return candidate


def collect_plan(code: str) -> Plan:
    """Scan *code* for underscored import aliases and decide their new names.

    Rules (an item must be exactly ``name as alias`` to be considered):
      * alias is ``_`` + the imported name  -> replaced by the imported name;
      * any other alias starting with ``_`` -> its leading underscores removed.
    Aliases whose replacement would be empty, not a valid identifier, or a
    Python keyword are skipped.

    Returns:
        Mapping of alias -> replacement name; empty when nothing qualifies.
        When the same alias appears more than once, the last occurrence wins.
    """
    plan: Plan = {}
    from_item = re.compile(r"^(\w+)[ \t]+as[ \t]+([A-Za-z_]\w*)$")
    import_item = re.compile(r"^([A-Za-z_][\w\.]*)[ \t]+as[ \t]+([A-Za-z_]\w*)$")

    # "from X import ..." statements
    for m in FROM_RE.finditer(code):
        inner = _normalize_dots_segment(m.group("stuff")).strip()
        if inner.startswith("(") and inner.endswith(")"):
            inner = inner[1:-1]
        for item in _split_top_level_commas(inner):
            a = from_item.match(item)
            if not a:
                continue
            name, alias = a.group(1), a.group(2)
            new_name = _planned_name(alias, name)
            if new_name is not None:
                plan[alias] = new_name

    # plain "import ..." statements
    for m in IMPORT_RE.finditer(code):
        tail = _normalize_dots_segment(m.group("tail"))
        for part in _split_top_level_commas(tail):
            a = import_item.match(part)
            if not a:
                continue
            full, alias = a.group(1), a.group(2)
            # Rule 1 compares against the last component: "a.b as _b" -> "b".
            new_name = _planned_name(alias, full.split(".")[-1])
            if new_name is not None:
                plan[alias] = new_name
    return plan
| # --- rewrite imports (preserve original newlines/spacing) --- | |
def rewrite_from_blocks(code: str, plan: Plan) -> str:
    """Rewrite underscored aliases inside ``from ... import ...`` statements.

    For every item of the exact form ``name as alias`` whose alias is in *plan*:
      * plan[alias] == name  -> the ``as`` clause is dropped;
      * otherwise            -> the item becomes ``name as plan[alias]``.
    Aliases missing from *plan* are never touched, so a name is not unbound
    unless its usages are renamed too.

    Statements with nothing to rewrite are returned verbatim, as are
    statements containing a ``#`` comment. A statement that is rewritten is
    rebuilt as ``from <mod> import <items joined by ", ">`` behind its original
    leading whitespace, i.e. on a single line.

    Args:
        code: Source text to process.
        plan: Mapping of alias -> replacement name.

    Returns:
        The source text with the affected statements rewritten.
    """
    item_re = re.compile(r"^(\w+)[ \t]+as[ \t]+([A-Za-z_]\w*)$")

    def repl(m: re.Match) -> str:
        original = m.group(0)
        raw_stuff = m.group("stuff")
        if "#" in raw_stuff:
            # Re-joining the names on one line could let a comment swallow
            # later names or the closing parenthesis; leave it alone.
            return original

        stuff = _normalize_dots_segment(raw_stuff)
        stripped = stuff.strip()
        had_paren = stripped.startswith("(") and stripped.endswith(")")
        inner = stripped[1:-1] if had_paren else stuff

        new_parts: List[str] = []
        changed = False
        for part in _split_top_level_commas(inner):
            rewritten = part
            a = item_re.match(part)
            if a:
                name, alias = a.group(1), a.group(2)
                target = plan.get(alias)
                if target is not None and target != alias:
                    # "Name as _Name" -> "Name"; "Name as _N" -> "Name as N"
                    rewritten = name if target == name else f"{name} as {target}"
            changed = changed or rewritten != part
            new_parts.append(rewritten)

        if not changed:
            # Keep the author's exact line breaks and spacing.
            return original

        inner_new = ", ".join(new_parts)
        if had_paren:
            inner_new = f"({inner_new})"
        prefix_ws = re.match(r"^[ \t]*", original).group(0)
        return f"{prefix_ws}from {m.group('mod')} import {inner_new}"

    return FROM_RE.sub(repl, code)
def rewrite_import_lines(code: str, plan: Plan) -> str:
    """Rewrite underscored aliases inside plain ``import ...`` statements.

    For every item of the exact form ``module.path as alias`` whose alias is
    in *plan*:
      * ``import pkg as _pkg``      -> ``import pkg``
      * ``import networkx as _nx``  -> ``import networkx as nx``
      * ``import a.b as _b``        -> ``import a.b as b``
    The last case keeps an ``as`` clause on purpose: a bare ``import a.b``
    binds ``a``, not ``b``, so dropping the clause would leave the renamed
    usages (``b``) undefined.

    Aliases missing from *plan* are never touched. Statements with nothing to
    rewrite are returned verbatim; a rewritten statement is rebuilt as
    ``import <items joined by ", ">`` behind its original leading whitespace.

    Args:
        code: Source text to process.
        plan: Mapping of alias -> replacement name.

    Returns:
        The source text with the affected statements rewritten.
    """
    item_re = re.compile(r"^([A-Za-z_][\w\.]*)[ \t]+as[ \t]+([A-Za-z_]\w*)$")

    def repl(m: re.Match) -> str:
        original = m.group(0)
        tail = _normalize_dots_segment(m.group("tail"))

        new_parts: List[str] = []
        changed = False
        for part in _split_top_level_commas(tail):
            rewritten = part
            a = item_re.match(part)
            if a:
                full, alias = a.group(1), a.group(2)
                target = plan.get(alias)
                if target is not None and target != alias:
                    if target == full:
                        # Undotted module whose own name is the target.
                        rewritten = full
                    else:
                        rewritten = f"{full} as {target}"
            changed = changed or rewritten != part
            new_parts.append(rewritten)

        if not changed:
            # Keep the author's exact line breaks and spacing.
            return original

        prefix_ws = re.match(r"^[ \t]*", original).group(0)
        return f"{prefix_ws}import {', '.join(new_parts)}"

    return IMPORT_RE.sub(repl, code)
| # --- replace usages (leave line breaks untouched) --- | |
def replace_usages(code: str, plan: Plan) -> str:
    """Rename every standalone occurrence of a planned alias in *code*.

    An occurrence counts only when it is a whole identifier and is NOT an
    attribute access: ``_nx.Graph()`` is renamed, while ``self._nx`` or
    ``pkg._nx`` are left alone because they refer to a different object.
    All aliases are substituted in a single pass, so the output of one
    replacement is never rewritten by another. Line breaks are untouched.

    NOTE: this is a textual substitution; matches inside string literals and
    comments are renamed as well.

    Args:
        code: Source text to process.
        plan: Mapping of alias -> replacement name.

    Returns:
        The source text with usages renamed (unchanged if *plan* is empty).
    """
    if not plan:
        return code
    # Longest alias first so the alternation can never stop at a shorter prefix.
    ordered = sorted(plan, key=len, reverse=True)
    alternation = "|".join(re.escape(alias) for alias in ordered)
    pattern = re.compile(rf"(?<![\w.])(?:{alternation})(?!\w)")
    return pattern.sub(lambda m: plan[m.group(0)], code)
| # --- end-to-end --- | |
def transform(code: str) -> str:
    """Run the full alias clean-up on *code* and return the result.

    The plan is collected first; when it is empty the input is returned as
    is. Otherwise the three passes run in order, each on the previous pass's
    output: ``from`` imports, plain imports, then usages.
    """
    plan = collect_plan(code)
    if not plan:
        return code
    result = code
    for step in (rewrite_from_blocks, rewrite_import_lines, replace_usages):
        result = step(result, plan)
    return result
def process_path(p: Path) -> Path | None:
    """Clean the file at *p* and write the result next to it.

    The file is read as UTF-8 and passed through ``transform``. If the text
    changed, it is written to ``<stem>_clean.py`` in the same directory and
    that path is returned; otherwise nothing is written and None is returned.
    A one-line status message is printed either way.
    """
    original = p.read_text(encoding="utf-8")
    cleaned = transform(original)
    if cleaned != original:
        target = p.with_name(p.stem + "_clean.py")
        target.write_text(cleaned, encoding="utf-8")
        print(f"✔ Wrote {target.name}")
        return target
    print(f"— No changes for {p.name}")
    return None
def main() -> None:
    """Command-line entry point.

    Usage: ``python clean_aliases_preserve_lines.py <file-or-directory>``

    A directory is searched recursively for ``*.py`` files; a single ``.py``
    file is processed directly. Anything else (missing path, non-Python
    file) is reported on stderr and the process exits with status 1 instead
    of silently doing nothing.
    """
    if len(sys.argv) < 2:
        print("Usage: python clean_aliases_preserve_lines.py <file-or-directory>")
        sys.exit(1)

    target = Path(sys.argv[1])
    if target.is_dir():
        # Snapshot the listing first: process_path() writes new *_clean.py
        # files into the tree, which must not be picked up mid-walk.
        for f in sorted(target.rglob("*.py")):
            process_path(f)
    elif target.is_file() and target.suffix == ".py":
        process_path(target)
    else:
        print(f"Error: {target} is not a .py file or a directory", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment