Created
October 20, 2025 13:24
-
-
Save rafaellehmkuhl/f961126dec840642c7d673b7bd5f31f6 to your computer and use it in GitHub Desktop.
Repo to Markdown Script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Generate a Markdown snapshot of the repository containing: | |
| 1) A file tree at the beginning | |
| 2) The name and content of all relevant files | |
| Relevance is determined by ignoring anything ignored by .gitignore and .dockerignore. | |
| Implementation details: | |
| - Uses `git ls-files --cached --others --exclude-standard` to respect .gitignore | |
| - Parses .dockerignore patterns (supports # comments, ! negations, *, **, ?, and directory patterns) | |
| - Skips binary files and files larger than a configurable size limit by default | |
| Usage: | |
| python3 generate_repo_markdown.py --output REPO_SNAPSHOT.md | |
| Options: | |
| --root <path> Root of the repo (defaults to git root if available, else CWD) | |
| --output <path> Output markdown file (default: REPO_SNAPSHOT.md at repo root) | |
| --max-bytes <int> Max file size to include in bytes (default: 524288 = 512 KiB) | |
| --include-binaries Include binary files (default: False) | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import os | |
| import sys | |
| import subprocess | |
| from pathlib import Path, PurePosixPath | |
| from typing import Iterable, List, Optional, Sequence, Tuple | |
def run_command(args: Sequence[str], cwd: Optional[Path] = None) -> Tuple[int, str, str]:
    """Run *args* as a subprocess and return ``(returncode, stdout, stderr)``.

    Output is decoded with ``errors="replace"`` so undecodable bytes never
    raise.  A missing executable is reported as exit code 127 (the shell's
    "command not found" convention) instead of raising.
    """
    try:
        completed = subprocess.run(
            args,
            cwd=None if cwd is None else str(cwd),
            capture_output=True,
            check=False,
        )
    except FileNotFoundError:
        return 127, "", f"command not found: {args[0]}"
    out_text = completed.stdout.decode(errors="replace")
    err_text = completed.stderr.decode(errors="replace")
    return completed.returncode, out_text, err_text
def find_git_root(start: Path) -> Optional[Path]:
    """Walk upward from *start* to the first directory containing ``.git``.

    Returns ``None`` when the filesystem root is reached without finding a
    repository, or after 64 levels (a guard against pathological trees).
    """
    candidate = start.resolve()
    for _ in range(64):
        if (candidate / ".git").exists():
            return candidate
        parent = candidate.parent
        if parent == candidate:
            # Reached the filesystem root.
            return None
        candidate = parent
    return None
def list_repo_files_via_git(repo_root: Path) -> Optional[List[Path]]:
    """List tracked plus untracked-but-not-ignored files using git.

    ``--exclude-standard`` makes git honor .gitignore; ``-z`` produces
    NUL-separated output so unusual filenames survive intact.  Returns
    ``None`` when the git command fails (e.g. git missing or not a repo).
    """
    cmd = ["git", "ls-files", "--cached", "--others", "--exclude-standard", "-z"]
    status, stdout, _ = run_command(cmd, cwd=repo_root)
    if status != 0:
        return None
    return [repo_root / name for name in stdout.split("\x00") if name]
def list_repo_files_fallback(repo_root: Path) -> List[Path]:
    """Recursively list files under *repo_root* when git is unavailable.

    Excludes ``.git`` and ``__pycache__`` directories by exact name so that
    legitimate dot-directories such as ``.github`` are still included (the
    previous ``startswith(".git")`` check wrongly dropped them).  Pruning is
    done in place via ``dirs[:]`` so ``os.walk`` never descends into the
    excluded trees at all.
    """
    excluded_dirs = {".git", "__pycache__"}
    collected: List[Path] = []
    for root, dirs, filenames in os.walk(repo_root):
        # In-place mutation is required: os.walk re-reads `dirs` to decide
        # which subdirectories to visit next.
        dirs[:] = [d for d in dirs if d not in excluded_dirs]
        for name in filenames:
            collected.append(Path(root) / name)
    return collected
def read_dockerignore(repo_root: Path) -> List[str]:
    """Return the effective pattern lines of ``.dockerignore``.

    Blank lines and ``#`` comments are dropped; surrounding whitespace is
    stripped.  A missing or unreadable file yields an empty list.
    """
    ignore_file = repo_root / ".dockerignore"
    if not ignore_file.exists():
        return []
    try:
        text = ignore_file.read_text(encoding="utf-8", errors="replace")
    except Exception:
        # An unreadable file is treated the same as an absent one.
        return []
    stripped = (raw.strip() for raw in text.splitlines())
    return [entry for entry in stripped if entry and not entry.startswith('#')]
| def _normalize_posix(path: Path, root: Path) -> str: | |
| rel = path.relative_to(root) | |
| # Normalize to POSIX-style for matching | |
| return str(PurePosixPath(rel.as_posix())) | |
| def _pattern_to_glob(pattern: str) -> Tuple[str, bool]: | |
| """ | |
| Convert a .dockerignore-like pattern to a glob usable by PurePosixPath.match. | |
| Returns (glob, directory_only) | |
| """ | |
| directory_only = pattern.endswith('/') | |
| pat = pattern[:-1] if directory_only else pattern | |
| anchored = pat.startswith('/') | |
| if anchored: | |
| pat = pat[1:] | |
| # If the pattern does not contain a slash, allow it to match anywhere in the path | |
| if not anchored and '/' not in pat: | |
| pat = f"**/{pat}" | |
| # For directory-only patterns, match everything under that directory | |
| if directory_only: | |
| pat = f"{pat}/**" | |
| return pat, directory_only | |
def dockerignore_includes(path: Path, repo_root: Path, patterns: Sequence[str]) -> bool:
    """Decide whether *path* survives .dockerignore filtering.

    Approximates docker semantics: files are included by default, each
    matching pattern flips the verdict (``!`` patterns re-include), and the
    last matching pattern wins.
    """
    if not patterns:
        return True
    candidate = PurePosixPath(_normalize_posix(path, repo_root))
    verdict = True
    for entry in patterns:
        negated = entry.startswith('!')
        body = entry[1:] if negated else entry
        glob, _ = _pattern_to_glob(body)
        if candidate.match(glob):
            # Last match wins: keep overwriting the verdict.
            verdict = negated
    return verdict
def is_probably_text_file(path: Path, sniff_bytes: int = 4096) -> bool:
    """Heuristically classify *path* as text.

    Reads at most *sniff_bytes* from the start: any NUL byte, a failed
    UTF-8 decode of the sample, or any I/O error all count as "not text".
    """
    try:
        with path.open('rb') as handle:
            sample = handle.read(sniff_bytes)
        if b"\x00" in sample:
            return False
        # Most source files are UTF-8; a clean decode is a good-enough signal.
        sample.decode('utf-8')
    except Exception:
        return False
    return True
def build_tree_lines(files: Sequence[Path], repo_root: Path) -> List[str]:
    """Render *files* as an indented tree, one path component per line.

    Paths are sorted so siblings are adjacent; each line is compared with
    the previous one and only the components past their common directory
    prefix are emitted, so every directory name appears exactly once.
    """
    sorted_rels = sorted(PurePosixPath(_normalize_posix(f, repo_root)) for f in files)
    tree: List[str] = ["."]
    previous: List[str] = []
    for rel in sorted_rels:
        components = list(rel.parts)
        # Length of the directory prefix shared with the previous path.
        shared = 0
        while (
            shared < len(previous)
            and shared < len(components)
            and previous[shared] == components[shared]
        ):
            shared += 1
        for depth in range(shared, len(components)):
            tree.append(f"{'  ' * depth}{components[depth]}")
        previous = components
    return tree
| def write_markdown( | |
| output_path: Path, | |
| repo_root: Path, | |
| files: Sequence[Path], | |
| max_bytes: int, | |
| include_binaries: bool, | |
| ) -> None: | |
| md_lines: List[str] = [] | |
| # Header and tree | |
| md_lines.append("# Repository Snapshot") | |
| md_lines.append("") | |
| md_lines.append("File tree:") | |
| md_lines.append("") | |
| md_lines.append("```") | |
| md_lines.extend(build_tree_lines(files, repo_root)) | |
| md_lines.append("```") | |
| md_lines.append("") | |
| # File contents | |
| for path in sorted(files): | |
| rel = _normalize_posix(path, repo_root) | |
| md_lines.append(f"## {rel}") | |
| try: | |
| size = path.stat().st_size | |
| except FileNotFoundError: | |
| md_lines.append("") | |
| md_lines.append("_Skipped: file not found_") | |
| md_lines.append("") | |
| continue | |
| if not include_binaries and not is_probably_text_file(path): | |
| md_lines.append("") | |
| md_lines.append("_Skipped: binary file_") | |
| md_lines.append("") | |
| continue | |
| if size > max_bytes: | |
| md_lines.append("") | |
| md_lines.append(f"_Skipped: file larger than {max_bytes} bytes (size={size})_") | |
| md_lines.append("") | |
| continue | |
| try: | |
| content = path.read_text(encoding="utf-8", errors="replace") | |
| except Exception as e: | |
| md_lines.append("") | |
| md_lines.append(f"_Skipped: unable to read file ({e})_") | |
| md_lines.append("") | |
| continue | |
| md_lines.append("") | |
| md_lines.append("```") | |
| md_lines.append(content) | |
| md_lines.append("```") | |
| md_lines.append("") | |
| output_path.write_text("\n".join(md_lines), encoding="utf-8") | |
def main(argv: Optional[Sequence[str]] = None) -> int:
    """CLI entry point: parse options, gather and filter files, write output."""
    parser = argparse.ArgumentParser(description="Generate a Markdown snapshot of the repository.")
    parser.add_argument("--root", type=str, default=None, help="Repository root; defaults to git root or CWD")
    parser.add_argument("--output", type=str, default=None, help="Output markdown file path (default: REPO_SNAPSHOT.md at repo root)")
    parser.add_argument("--max-bytes", type=int, default=524288, help="Max file size to include in bytes (default: 524288)")
    parser.add_argument("--include-binaries", action="store_true", help="Include binary files (default: False)")
    opts = parser.parse_args(argv)

    start_dir = Path(opts.root) if opts.root else Path.cwd()
    repo_root = find_git_root(start_dir) or start_dir.resolve()

    # Prefer git's view of the repo; fall back to a plain walk otherwise.
    candidates = list_repo_files_via_git(repo_root) or list_repo_files_fallback(repo_root)

    # Layer .dockerignore filtering on top of git's .gitignore handling.
    docker_patterns = read_dockerignore(repo_root)
    candidates = [f for f in candidates if dockerignore_includes(f, repo_root, docker_patterns)]

    output = Path(opts.output) if opts.output else repo_root / "REPO_SNAPSHOT.md"
    # Never embed the snapshot file inside itself on a re-run.
    resolved_output = output.resolve()
    candidates = [f for f in candidates if f.resolve() != resolved_output]

    write_markdown(output, repo_root, candidates, opts.max_bytes, opts.include_binaries)
    print(str(output))
    return 0
if __name__ == "__main__":
    # Equivalent to sys.exit(main()): propagate the exit status to the shell.
    raise SystemExit(main())
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This script is useful when you need to share an entire codebase with an LLM so you can discuss it.