Created
October 20, 2025 13:24
-
-
Save rafaellehmkuhl/f961126dec840642c7d673b7bd5f31f6 to your computer and use it in GitHub Desktop.
Repo to Markdown Script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Generate a Markdown snapshot of the repository containing: | |
| 1) A file tree at the beginning | |
| 2) The name and content of all relevant files | |
| Relevance is determined by ignoring anything ignored by .gitignore and .dockerignore. | |
| Implementation details: | |
| - Uses `git ls-files --cached --others --exclude-standard` to respect .gitignore | |
| - Parses .dockerignore patterns (supports # comments, ! negations, *, **, ?, and directory patterns) | |
| - Skips binary files and files larger than a configurable size limit by default | |
| Usage: | |
| python3 generate_repo_markdown.py --output REPO_SNAPSHOT.md | |
| Options: | |
| --root <path> Root of the repo (defaults to git root if available, else CWD) | |
| --output <path> Output markdown file (default: REPO_SNAPSHOT.md at repo root) | |
| --max-bytes <int> Max file size to include in bytes (default: 524288 = 512 KiB) | |
| --include-binaries Include binary files (default: False) | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import os | |
| import sys | |
| import subprocess | |
| from pathlib import Path, PurePosixPath | |
| from typing import Iterable, List, Optional, Sequence, Tuple | |
def run_command(args: Sequence[str], cwd: Optional[Path] = None) -> Tuple[int, str, str]:
    """Run *args* as a subprocess and return ``(returncode, stdout, stderr)``.

    Output is decoded with ``errors="replace"`` so undecodable bytes never
    raise.  A missing executable is reported as exit code 127 (the shell's
    "command not found" convention) instead of raising.
    """
    try:
        completed = subprocess.run(
            args,
            cwd=None if cwd is None else str(cwd),
            capture_output=True,
            check=False,
        )
    except FileNotFoundError:
        return 127, "", f"command not found: {args[0]}"
    out_text = completed.stdout.decode(errors="replace")
    err_text = completed.stderr.decode(errors="replace")
    return completed.returncode, out_text, err_text
def find_git_root(start: Path) -> Optional[Path]:
    """Walk upward from *start* to the first directory containing ``.git``.

    Returns ``None`` when the filesystem root is reached without finding a
    repository, or after 64 levels (a guard against pathological trees).
    """
    candidate = start.resolve()
    for _ in range(64):
        if (candidate / ".git").exists():
            return candidate
        parent = candidate.parent
        if parent == candidate:
            # Reached the filesystem root.
            return None
        candidate = parent
    return None
def list_repo_files_via_git(repo_root: Path) -> Optional[List[Path]]:
    """List tracked plus untracked-but-not-ignored files using git.

    ``--exclude-standard`` makes git honor .gitignore; ``-z`` produces
    NUL-separated output so unusual filenames survive intact.  Returns
    ``None`` when the git command fails (e.g. git missing or not a repo).
    """
    cmd = ["git", "ls-files", "--cached", "--others", "--exclude-standard", "-z"]
    status, stdout, _ = run_command(cmd, cwd=repo_root)
    if status != 0:
        return None
    return [repo_root / name for name in stdout.split("\x00") if name]
def list_repo_files_fallback(repo_root: Path) -> List[Path]:
    """Recursively list files under *repo_root* when git is unavailable.

    Excludes ``.git`` and ``__pycache__`` directories by exact name so that
    legitimate dot-directories such as ``.github`` are still included (the
    previous ``startswith(".git")`` check wrongly dropped them).  Pruning is
    done in place via ``dirs[:]`` so ``os.walk`` never descends into the
    excluded trees at all.
    """
    excluded_dirs = {".git", "__pycache__"}
    collected: List[Path] = []
    for root, dirs, filenames in os.walk(repo_root):
        # In-place mutation is required: os.walk re-reads `dirs` to decide
        # which subdirectories to visit next.
        dirs[:] = [d for d in dirs if d not in excluded_dirs]
        for name in filenames:
            collected.append(Path(root) / name)
    return collected
def read_dockerignore(repo_root: Path) -> List[str]:
    """Return the effective pattern lines of ``.dockerignore``.

    Blank lines and ``#`` comments are dropped; surrounding whitespace is
    stripped.  A missing or unreadable file yields an empty list.
    """
    ignore_file = repo_root / ".dockerignore"
    if not ignore_file.exists():
        return []
    try:
        text = ignore_file.read_text(encoding="utf-8", errors="replace")
    except Exception:
        # An unreadable file is treated the same as an absent one.
        return []
    stripped = (raw.strip() for raw in text.splitlines())
    return [entry for entry in stripped if entry and not entry.startswith('#')]
| def _normalize_posix(path: Path, root: Path) -> str: | |
| rel = path.relative_to(root) | |
| # Normalize to POSIX-style for matching | |
| return str(PurePosixPath(rel.as_posix())) | |
| def _pattern_to_glob(pattern: str) -> Tuple[str, bool]: | |
| """ | |
| Convert a .dockerignore-like pattern to a glob usable by PurePosixPath.match. | |
| Returns (glob, directory_only) | |
| """ | |
| directory_only = pattern.endswith('/') | |
| pat = pattern[:-1] if directory_only else pattern | |
| anchored = pat.startswith('/') | |
| if anchored: | |
| pat = pat[1:] | |
| # If the pattern does not contain a slash, allow it to match anywhere in the path | |
| if not anchored and '/' not in pat: | |
| pat = f"**/{pat}" | |
| # For directory-only patterns, match everything under that directory | |
| if directory_only: | |
| pat = f"{pat}/**" | |
| return pat, directory_only | |
def dockerignore_includes(path: Path, repo_root: Path, patterns: Sequence[str]) -> bool:
    """Decide whether *path* survives .dockerignore filtering.

    Approximates docker semantics: files are included by default, each
    matching pattern flips the verdict (``!`` patterns re-include), and the
    last matching pattern wins.
    """
    if not patterns:
        return True
    candidate = PurePosixPath(_normalize_posix(path, repo_root))
    verdict = True
    for entry in patterns:
        negated = entry.startswith('!')
        body = entry[1:] if negated else entry
        glob, _ = _pattern_to_glob(body)
        if candidate.match(glob):
            # Last match wins: keep overwriting the verdict.
            verdict = negated
    return verdict
def is_probably_text_file(path: Path, sniff_bytes: int = 4096) -> bool:
    """Heuristically classify *path* as text.

    Reads at most *sniff_bytes* from the start: any NUL byte, a failed
    UTF-8 decode of the sample, or any I/O error all count as "not text".
    """
    try:
        with path.open('rb') as handle:
            sample = handle.read(sniff_bytes)
        if b"\x00" in sample:
            return False
        # Most source files are UTF-8; a clean decode is a good-enough signal.
        sample.decode('utf-8')
    except Exception:
        return False
    return True
def build_tree_lines(files: Sequence[Path], repo_root: Path) -> List[str]:
    """Render *files* as an indented tree, one path component per line.

    Paths are sorted so siblings are adjacent; each line is compared with
    the previous one and only the components past their common directory
    prefix are emitted, so every directory name appears exactly once.
    """
    sorted_rels = sorted(PurePosixPath(_normalize_posix(f, repo_root)) for f in files)
    tree: List[str] = ["."]
    previous: List[str] = []
    for rel in sorted_rels:
        components = list(rel.parts)
        # Length of the directory prefix shared with the previous path.
        shared = 0
        while (
            shared < len(previous)
            and shared < len(components)
            and previous[shared] == components[shared]
        ):
            shared += 1
        for depth in range(shared, len(components)):
            tree.append(f"{'  ' * depth}{components[depth]}")
        previous = components
    return tree
| def write_markdown( | |
| output_path: Path, | |
| repo_root: Path, | |
| files: Sequence[Path], | |
| max_bytes: int, | |
| include_binaries: bool, | |
| ) -> None: | |
| md_lines: List[str] = [] | |
| # Header and tree | |
| md_lines.append("# Repository Snapshot") | |
| md_lines.append("") | |
| md_lines.append("File tree:") | |
| md_lines.append("") | |
| md_lines.append("```") | |
| md_lines.extend(build_tree_lines(files, repo_root)) | |
| md_lines.append("```") | |
| md_lines.append("") | |
| # File contents | |
| for path in sorted(files): | |
| rel = _normalize_posix(path, repo_root) | |
| md_lines.append(f"## {rel}") | |
| try: | |
| size = path.stat().st_size | |
| except FileNotFoundError: | |
| md_lines.append("") | |
| md_lines.append("_Skipped: file not found_") | |
| md_lines.append("") | |
| continue | |
| if not include_binaries and not is_probably_text_file(path): | |
| md_lines.append("") | |
| md_lines.append("_Skipped: binary file_") | |
| md_lines.append("") | |
| continue | |
| if size > max_bytes: | |
| md_lines.append("") | |
| md_lines.append(f"_Skipped: file larger than {max_bytes} bytes (size={size})_") | |
| md_lines.append("") | |
| continue | |
| try: | |
| content = path.read_text(encoding="utf-8", errors="replace") | |
| except Exception as e: | |
| md_lines.append("") | |
| md_lines.append(f"_Skipped: unable to read file ({e})_") | |
| md_lines.append("") | |
| continue | |
| md_lines.append("") | |
| md_lines.append("```") | |
| md_lines.append(content) | |
| md_lines.append("```") | |
| md_lines.append("") | |
| output_path.write_text("\n".join(md_lines), encoding="utf-8") | |
def main(argv: Optional[Sequence[str]] = None) -> int:
    """CLI entry point: parse options, gather and filter files, write output."""
    parser = argparse.ArgumentParser(description="Generate a Markdown snapshot of the repository.")
    parser.add_argument("--root", type=str, default=None, help="Repository root; defaults to git root or CWD")
    parser.add_argument("--output", type=str, default=None, help="Output markdown file path (default: REPO_SNAPSHOT.md at repo root)")
    parser.add_argument("--max-bytes", type=int, default=524288, help="Max file size to include in bytes (default: 524288)")
    parser.add_argument("--include-binaries", action="store_true", help="Include binary files (default: False)")
    opts = parser.parse_args(argv)

    start_dir = Path(opts.root) if opts.root else Path.cwd()
    repo_root = find_git_root(start_dir) or start_dir.resolve()

    # Prefer git's view of the repo; fall back to a plain walk otherwise.
    candidates = list_repo_files_via_git(repo_root) or list_repo_files_fallback(repo_root)

    # Layer .dockerignore filtering on top of git's .gitignore handling.
    docker_patterns = read_dockerignore(repo_root)
    candidates = [f for f in candidates if dockerignore_includes(f, repo_root, docker_patterns)]

    output = Path(opts.output) if opts.output else repo_root / "REPO_SNAPSHOT.md"
    # Never embed the snapshot file inside itself on a re-run.
    resolved_output = output.resolve()
    candidates = [f for f in candidates if f.resolve() != resolved_output]

    write_markdown(output, repo_root, candidates, opts.max_bytes, opts.include_binaries)
    print(str(output))
    return 0
if __name__ == "__main__":
    # Equivalent to sys.exit(main()): propagate the exit status to the shell.
    raise SystemExit(main())
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This script is useful when you need to share an entire codebase with an LLM so you can discuss it.