Skip to content

Instantly share code, notes, and snippets.

@rafaellehmkuhl
Created October 20, 2025 13:24
Show Gist options
  • Select an option

  • Save rafaellehmkuhl/f961126dec840642c7d673b7bd5f31f6 to your computer and use it in GitHub Desktop.

Select an option

Save rafaellehmkuhl/f961126dec840642c7d673b7bd5f31f6 to your computer and use it in GitHub Desktop.
Repo to Markdown Script
#!/usr/bin/env python3
"""
Generate a Markdown snapshot of the repository containing:
1) A file tree at the beginning
2) The name and content of all relevant files
Relevance is determined by ignoring anything ignored by .gitignore and .dockerignore.
Implementation details:
- Uses `git ls-files --cached --others --exclude-standard` to respect .gitignore
- Parses .dockerignore patterns (supports # comments, ! negations, *, **, ?, and directory patterns)
- Skips binary files and files larger than a configurable size limit by default
Usage:
python3 generate_repo_markdown.py --output REPO_SNAPSHOT.md
Options:
--root <path> Root of the repo (defaults to git root if available, else CWD)
--output <path> Output markdown file (default: REPO_SNAPSHOT.md at repo root)
--max-bytes <int> Max file size to include in bytes (default: 524288 = 512 KiB)
--include-binaries Include binary files (default: False)
"""
from __future__ import annotations

import argparse
import codecs
import os
import re
import subprocess
import sys
from functools import lru_cache
from pathlib import Path, PurePosixPath
from typing import Iterable, List, Optional, Sequence, Tuple
def run_command(args: Sequence[str], cwd: Optional[Path] = None) -> Tuple[int, str, str]:
    """Run an external command and capture its result without raising on failure.

    Args:
        args: Command and arguments, passed to ``subprocess.run`` as a list.
        cwd: Optional working directory for the child process.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple. Output is decoded with
        undecodable bytes replaced. When the executable cannot be found the
        tuple is ``(127, "", "command not found: <name>")``.
    """
    working_dir = None
    if cwd:
        working_dir = str(cwd)
    try:
        completed = subprocess.run(
            args,
            cwd=working_dir,
            capture_output=True,
            check=False,
        )
    except FileNotFoundError:
        # Mirror the shell convention of exit status 127 for a missing command.
        return 127, "", f"command not found: {args[0]}"
    return (
        completed.returncode,
        completed.stdout.decode(errors="replace"),
        completed.stderr.decode(errors="replace"),
    )
def find_git_root(start: Path) -> Optional[Path]:
    """Locate the nearest directory at or above ``start`` that contains ``.git``.

    The search inspects the resolved starting path and then its ancestors,
    looking at no more than 64 directories in total.

    Returns:
        The first directory holding a ``.git`` entry, or ``None`` when the
        filesystem root or the depth cap is reached without finding one.
    """
    origin = start.resolve()
    for candidate in (origin, *origin.parents)[:64]:
        if (candidate / ".git").exists():
            return candidate
    return None
def list_repo_files_via_git(repo_root: Path) -> Optional[List[Path]]:
    """List tracked and untracked-but-not-ignored files by asking git.

    Runs ``git ls-files --cached --others --exclude-standard -z`` inside
    ``repo_root``.

    Returns:
        Paths joined onto ``repo_root``, or ``None`` when the git command
        exits with a non-zero status.
    """
    git_args = ["git", "ls-files", "--cached", "--others", "--exclude-standard", "-z"]
    status, listing, _stderr = run_command(git_args, cwd=repo_root)
    if status != 0:
        return None
    found: List[Path] = []
    # With -z every entry is terminated by a NUL byte, so drop the empty tail.
    for entry in listing.split("\x00"):
        if entry:
            found.append(repo_root / entry)
    return found
def list_repo_files_fallback(repo_root: Path) -> List[Path]:
    """Enumerate files under ``repo_root`` by walking the filesystem.

    Used when git cannot provide the listing. Directories named exactly
    ``.git`` or ``__pycache__`` are pruned at any depth; every other file is
    returned, including those in look-alike directories such as ``.github``.

    Args:
        repo_root: Directory to walk.

    Returns:
        File paths in a deterministic (sorted per directory) order.
    """
    skipped_dirs = {".git", "__pycache__"}
    files: List[Path] = []
    for root, dirs, filenames in os.walk(repo_root):
        # Prune in place so os.walk never descends into the skipped trees.
        # Comparing whole directory names (rather than a string prefix of the
        # relative path) keeps ".github", ".gitlab", etc. in the result.
        dirs[:] = sorted(d for d in dirs if d not in skipped_dirs)
        base = Path(root)
        files.extend(base / name for name in sorted(filenames))
    return files
def read_dockerignore(repo_root: Path) -> List[str]:
    """Load the usable pattern lines from ``<repo_root>/.dockerignore``.

    Returns:
        Each line stripped of surrounding whitespace, with blank lines and
        lines starting with ``#`` removed. An empty list is returned when the
        file does not exist or cannot be read.
    """
    ignore_file = repo_root / ".dockerignore"
    if not ignore_file.exists():
        return []
    try:
        text = ignore_file.read_text(encoding="utf-8", errors="replace")
    except Exception:
        # Best effort: an unreadable ignore file is treated as "no patterns".
        return []
    stripped = (raw.strip() for raw in text.splitlines())
    return [entry for entry in stripped if entry and not entry.startswith('#')]
def _normalize_posix(path: Path, root: Path) -> str:
    """Return ``path`` relative to ``root`` as a POSIX-style string.

    Raises:
        ValueError: Propagated from ``Path.relative_to`` when ``path`` is not
            located under ``root``.
    """
    rel = path.relative_to(root)
    # Normalize to POSIX-style so matching always uses "/" separators
    return str(PurePosixPath(rel.as_posix()))


def _pattern_to_glob(pattern: str) -> Tuple[str, bool]:
    """Convert a .dockerignore-like pattern to a root-relative glob.

    A trailing ``/`` marks a directory-only pattern, a leading ``/`` anchors
    the pattern at the repository root, and a pattern without any ``/`` is
    allowed to match at any depth (expressed with a ``**/`` prefix).

    Returns:
        ``(glob, directory_only)``. For directory-only patterns the glob ends
        in ``/**`` so that it covers everything beneath the directory.
    """
    directory_only = pattern.endswith('/')
    pat = pattern[:-1] if directory_only else pattern
    anchored = pat.startswith('/')
    if anchored:
        pat = pat[1:]
    # If the pattern does not contain a slash, allow it to match anywhere in the path
    if not anchored and '/' not in pat:
        pat = f"**/{pat}"
    # For directory-only patterns, match everything under that directory
    if directory_only:
        pat = f"{pat}/**"
    return pat, directory_only


@lru_cache(maxsize=512)
def _compile_glob(glob: str, match_descendants: bool) -> "re.Pattern[str]":
    """Translate a glob produced by ``_pattern_to_glob`` into a compiled regex.

    ``PurePosixPath.match`` cannot be used for this: it treats ``**`` like a
    single-component ``*`` and never matches when the pattern has more
    components than the path, so ``**/*.log`` would miss a top-level
    ``a.log``. The translation here gives the wildcards their ignore-file
    meaning instead:

    - ``**/`` matches zero or more leading directories, ``**`` anything;
    - ``*`` and ``?`` never cross a ``/``;
    - ``[seq]`` / ``[!seq]`` character classes are passed through.

    When ``match_descendants`` is true the regex also accepts any path located
    beneath a match, so a pattern naming a directory excludes its contents.
    Results are cached because the same patterns are tested against every file.
    """
    pieces: List[str] = []
    i = 0
    n = len(glob)
    while i < n:
        ch = glob[i]
        if ch == "*":
            if glob.startswith("**", i):
                i += 2
                if glob.startswith("/", i):
                    # "**/" may stand for no directory at all
                    pieces.append("(?:.*/)?")
                    i += 1
                else:
                    pieces.append(".*")
            else:
                pieces.append("[^/]*")
                i += 1
            continue
        if ch == "?":
            pieces.append("[^/]")
        elif ch == "[":
            # Search from i + 2 so that a "]" directly after "[" is a member
            close = glob.find("]", i + 2)
            if close == -1:
                pieces.append(re.escape(ch))
            else:
                body = glob[i + 1:close].replace("\\", "\\\\")
                if body[0] in "!^":
                    body = "^" + body[1:]
                pieces.append(f"[{body}]")
                i = close
        else:
            pieces.append(re.escape(ch))
        i += 1
    tail = "(?:/.*)?" if match_descendants else ""
    try:
        return re.compile("".join(pieces) + tail, re.DOTALL)
    except re.error:
        # Malformed character class: fall back to a literal comparison
        return re.compile(re.escape(glob) + tail, re.DOTALL)


def dockerignore_includes(path: Path, repo_root: Path, patterns: Sequence[str]) -> bool:
    """
    Apply .dockerignore semantics (approximate): last match wins, default include.
    Supports: comments (already removed), !negation, *, **, ?, [seq], trailing / for
    dirs, leading / anchor.

    A pattern matches a file when it matches the file's repo-relative path or
    one of its parent directories; directory-only patterns (trailing ``/``)
    match parent directories only.

    Args:
        path: File to test; must be located under ``repo_root``.
        repo_root: Directory the patterns are relative to.
        patterns: Pattern lines in file order, comments and blanks removed.

    Returns:
        True when the file should be kept, False when it is ignored.

    Raises:
        ValueError: When ``path`` is not located under ``repo_root``.
    """
    if not patterns:
        return True
    posix_path = _normalize_posix(path, repo_root)
    include = True
    for raw in patterns:
        negated = raw.startswith('!')
        pat = raw[1:] if negated else raw
        if not pat.strip('/'):
            # Nothing left to match (e.g. "!" or "/"); ignore the line
            continue
        glob_pat, directory_only = _pattern_to_glob(pat)
        if _compile_glob(glob_pat, not directory_only).fullmatch(posix_path):
            include = negated
    return include
def is_probably_text_file(path: Path, sniff_bytes: int = 4096) -> bool:
    """Heuristically decide whether ``path`` holds UTF-8 text.

    Only the first ``sniff_bytes`` bytes are inspected. The file is considered
    binary when that prefix contains a NUL byte or is not valid UTF-8.

    Args:
        path: File to inspect.
        sniff_bytes: Number of leading bytes to read.

    Returns:
        True for probable text, False for probable binary or when the file
        cannot be opened or read.
    """
    try:
        with path.open('rb') as f:
            chunk = f.read(sniff_bytes)
    except (OSError, ValueError):
        return False
    if b"\x00" in chunk:
        return False
    # Try UTF-8 decode as heuristic; many source files are UTF-8.
    # The prefix may end in the middle of a multi-byte character, so decode
    # incrementally: an incomplete trailing sequence is only an error when the
    # whole file was read (a short or unbounded read), not when it was cut off
    # by the sniff limit.
    reached_eof = sniff_bytes < 0 or len(chunk) < sniff_bytes
    decoder = codecs.getincrementaldecoder('utf-8')()
    try:
        decoder.decode(chunk, final=reached_eof)
    except UnicodeDecodeError:
        return False
    return True
def build_tree_lines(files: Sequence[Path], repo_root: Path) -> List[str]:
    """Render the given files as an indented tree, one path component per line.

    The first line is ``"."``. Paths are processed in sorted order; for each
    path only the components that differ from the previous path are emitted,
    each indented by one space per level of depth.
    """
    ordered = sorted(PurePosixPath(_normalize_posix(entry, repo_root)) for entry in files)
    tree: List[str] = ["."]
    previous: Tuple[str, ...] = ()
    for rel_path in ordered:
        segments = rel_path.parts
        # Count leading components shared with the previous path so that
        # directories already printed are not repeated.
        shared = 0
        limit = min(len(previous), len(segments))
        while shared < limit and previous[shared] == segments[shared]:
            shared += 1
        for depth, segment in enumerate(segments[shared:], start=shared):
            tree.append(" " * depth + segment)
        previous = segments
    return tree
def _code_fence(content: str) -> str:
    """Return a backtick fence longer than any backtick run inside ``content``."""
    longest = 0
    run = 0
    for ch in content:
        run = run + 1 if ch == "`" else 0
        if run > longest:
            longest = run
    return "`" * max(3, longest + 1)


def _skipped(reason: str) -> List[str]:
    """Return the Markdown lines used in place of a file's content."""
    return ["", f"_Skipped: {reason}_", ""]


def write_markdown(
    output_path: Path,
    repo_root: Path,
    files: Sequence[Path],
    max_bytes: int,
    include_binaries: bool,
) -> None:
    """Write the repository snapshot to ``output_path``.

    The document starts with a file tree and is followed by one ``##`` section
    per file (sorted by path) holding the file's content in a fenced code
    block. Files that are missing, cannot be inspected or read, look binary
    (unless ``include_binaries`` is set) or exceed ``max_bytes`` get a
    "Skipped" note instead of content.

    Args:
        output_path: Destination file; missing parent directories are created.
        repo_root: Root that section headings and the tree are relative to.
        files: Files to include; each must be located under ``repo_root``.
        max_bytes: Largest file size, in bytes, whose content is embedded.
        include_binaries: Embed files even when they do not look like text.

    Raises:
        OSError: When the output file cannot be written.
    """
    md_lines: List[str] = []
    # Header and tree
    tree_lines = build_tree_lines(files, repo_root)
    tree_fence = _code_fence("\n".join(tree_lines))
    md_lines.append("# Repository Snapshot")
    md_lines.append("")
    md_lines.append("File tree:")
    md_lines.append("")
    md_lines.append(tree_fence)
    md_lines.extend(tree_lines)
    md_lines.append(tree_fence)
    md_lines.append("")
    # File contents
    for path in sorted(files):
        rel = _normalize_posix(path, repo_root)
        md_lines.append(f"## {rel}")
        try:
            size = path.stat().st_size
        except FileNotFoundError:
            md_lines.extend(_skipped("file not found"))
            continue
        except OSError as e:
            # e.g. permission denied: note it instead of aborting the snapshot
            md_lines.extend(_skipped(f"unable to inspect file ({e})"))
            continue
        if not include_binaries and not is_probably_text_file(path):
            md_lines.extend(_skipped("binary file"))
            continue
        if size > max_bytes:
            md_lines.extend(_skipped(f"file larger than {max_bytes} bytes (size={size})"))
            continue
        try:
            content = path.read_text(encoding="utf-8", errors="replace")
        except Exception as e:
            md_lines.extend(_skipped(f"unable to read file ({e})"))
            continue
        # A fixed three-backtick fence would be closed early by any file that
        # itself contains a backtick fence (READMEs, docs), so size it to fit.
        fence = _code_fence(content)
        md_lines.append("")
        md_lines.append(fence)
        md_lines.append(content)
        md_lines.append(fence)
        md_lines.append("")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text("\n".join(md_lines), encoding="utf-8")
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Command-line entry point: build the snapshot and print its path.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read ``sys.argv``.

    Returns:
        0 on success, 1 when the output file cannot be written. Invalid
        arguments terminate through argparse's usual ``SystemExit``.
    """
    parser = argparse.ArgumentParser(description="Generate a Markdown snapshot of the repository.")
    parser.add_argument("--root", type=str, default=None, help="Repository root; defaults to git root or CWD")
    parser.add_argument("--output", type=str, default=None, help="Output markdown file path (default: REPO_SNAPSHOT.md at repo root)")
    parser.add_argument("--max-bytes", type=int, default=524288, help="Max file size to include in bytes (default: 524288)")
    parser.add_argument("--include-binaries", action="store_true", help="Include binary files (default: False)")
    args = parser.parse_args(argv)

    start_dir = Path(args.root) if args.root else Path.cwd()
    if not start_dir.is_dir():
        parser.error(f"--root is not a directory: {start_dir}")
    git_root = find_git_root(start_dir)
    repo_root = git_root or start_dir.resolve()

    # Only fall back to walking the filesystem when git itself failed; an
    # empty listing from git is a valid answer and must not trigger the walk.
    files = list_repo_files_via_git(repo_root)
    if files is None:
        files = list_repo_files_fallback(repo_root)

    # Apply .dockerignore filtering
    docker_patterns = read_dockerignore(repo_root)
    files = [f for f in files if dockerignore_includes(f, repo_root, docker_patterns)]

    # Ensure output path
    output = Path(args.output) if args.output else (repo_root / "REPO_SNAPSHOT.md")
    # Avoid including the output file itself if it exists and is within the repo root
    output_resolved = output.resolve()
    files = [f for f in files if f.resolve() != output_resolved]

    try:
        write_markdown(output, repo_root, files, args.max_bytes, args.include_binaries)
    except OSError as exc:
        print(f"error: cannot write {output}: {exc}", file=sys.stderr)
        return 1
    print(str(output))
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
@rafaellehmkuhl
Copy link
Author

This script can help when you need to upload a codebase to an LLM to talk about it.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment