Skip to content

Instantly share code, notes, and snippets.

@jthemphill
Created November 18, 2025 20:01
Show Gist options
  • Select an option

  • Save jthemphill/d183b80c1442da934d1f743d9e90356c to your computer and use it in GitHub Desktop.

Select an option

Save jthemphill/d183b80c1442da934d1f743d9e90356c to your computer and use it in GitHub Desktop.
Find the oldest TODO in a Git codebase
#!/usr/bin/env python3
"""
Find the oldest TODO comment that still exists in the codebase.
This script traverses git history commit-by-commit, finds commits that add TODOs,
and checks whether each TODO still exists in the current version of the code.
"""
import asyncio
import re
import sys
from collections import namedtuple
from typing import List, Optional, Set, Tuple
# Immutable record describing one TODO line that a commit added.
TodoInfo = namedtuple(
    typename="TodoInfo",
    field_names=[
        "commit_hash",
        "commit_date",
        "file_path",
        "line_number",
        "todo_text",
        "author",
    ],
)
async def run_git_command(
    cmd: List[str],
    cwd: Optional[str] = "/Users/jhemphill/dev/retool_development",
) -> str:
    """Run ``git <cmd>`` asynchronously and return its stripped stdout.

    Args:
        cmd: Arguments passed to ``git`` (without the leading ``git``).
        cwd: Directory to run git in. Defaults to the checkout this script
            was written for; pass another path (or ``None`` for the current
            working directory) to inspect a different repository.

    Returns:
        The command's standard output, decoded and with surrounding
        whitespace stripped.

    On any failure (git cannot be started, or it exits non-zero) an error
    is printed to stderr and the process exits with status 1.
    """
    try:
        process = await asyncio.create_subprocess_exec(
            "git",
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=cwd,
        )
        stdout, stderr = await process.communicate()
    except Exception as e:
        print(f"Error running git command: {' '.join(cmd)}", file=sys.stderr)
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    if process.returncode != 0:
        print(f"Error running git command: {' '.join(cmd)}", file=sys.stderr)
        print(f"Error: {stderr.decode(errors='replace')}", file=sys.stderr)
        sys.exit(1)

    # Diffs routinely contain bytes that are not valid UTF-8 (binary or
    # legacy-encoded files); replace them instead of aborting the whole run.
    return stdout.decode(errors="replace").strip()
async def get_current_todos(
    cwd: Optional[str] = "/Users/jhemphill/dev/retool_development",
) -> Set[str]:
    """Use RipGrep to fetch all TODO comments currently in the codebase.

    Args:
        cwd: Directory to search. Defaults to the checkout this script was
            written for; pass another path (or ``None`` for the current
            working directory) to search elsewhere.

    Returns:
        The set of normalized TODO signatures (whitespace collapsed,
        lower-cased) found in the working tree.

    If ripgrep cannot be started or reports an error, a message is printed
    to stderr and the process exits with status 1.
    """
    try:
        process = await asyncio.create_subprocess_exec(
            "rg",
            "-i",  # Case-insensitive
            # When stdout is a pipe ripgrep omits line numbers by default, so
            # request the "file_path:line_number:content" format explicitly.
            "--no-heading",
            "--with-filename",
            "--line-number",
            "TODO",  # Pattern to match TODO comments
            # Explicit search path so ripgrep never falls back to reading
            # stdin when this script itself is run with piped input.
            ".",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=cwd,
        )
        stdout, stderr = await process.communicate()
    except Exception as e:
        print(f"Error fetching current TODOs with ripgrep: {e}", file=sys.stderr)
        sys.exit(1)

    # Exit code 1 means no matches
    if process.returncode not in (0, 1):
        print(
            f"Error running ripgrep: {stderr.decode(errors='replace')}",
            file=sys.stderr,
        )
        sys.exit(1)

    # Extract TODO signatures from ripgrep output
    current_todo_signatures: Set[str] = set()
    for line in stdout.decode(errors="replace").split("\n"):
        if not line.strip():
            continue
        # RipGrep output format: file_path:line_number:content
        # Extract the content part (everything after the second colon)
        parts = line.split(":", 2)
        if len(parts) < 3:
            continue
        signature = extract_todo_signature(parts[2])
        if signature:
            # Normalize for comparison
            current_todo_signatures.add(" ".join(signature.split()).lower())
    return current_todo_signatures
async def get_all_commits() -> List[str]:
    """Return the commit hashes printed by ``git log --reverse`` (oldest first).

    An empty list is returned when git prints nothing.
    """
    log_output = await run_git_command(["log", "--reverse", "--format=%H"])
    if not log_output:
        return []
    return log_output.split("\n")
async def get_commit_info(commit_hash: str) -> Tuple[str, str]:
    """Get commit date and author for a commit.

    Args:
        commit_hash: Hash of the commit to inspect.

    Returns:
        A ``(date, author)`` tuple: the author date in git's ``%ai`` format
        and the author name (``%an``).
    """
    # One git invocation yields both fields (date on the first line, author
    # on the second) instead of spawning a separate process for each.
    output = await run_git_command(
        ["log", "-1", "--format=%ai%n%an", commit_hash]
    )
    date, _, author = output.partition("\n")
    return date, author
def extract_todos_from_diff(
    diff_output: str, commit_hash: str, commit_date: str, author: str
) -> List[TodoInfo]:
    """Extract TODO comments that were added in a commit diff.

    Args:
        diff_output: Unified diff text for the commit.
        commit_hash: Hash recorded on every returned ``TodoInfo``.
        commit_date: Commit date recorded on every returned ``TodoInfo``.
        author: Author name recorded on every returned ``TodoInfo``.

    Returns:
        One ``TodoInfo`` per added line that contains a TODO marker, in the
        order the lines appear in the diff. ``line_number`` is the line's
        position in the new version of the file.
    """
    todos = []
    current_file = None
    # Line number, in the new version of the file, of the last line seen.
    new_line_number = 0
    # Pattern to match TODO comments (case-insensitive)
    todo_pattern = re.compile(r"TODO[:\s]", re.IGNORECASE)
    # First "+<digits>" in a hunk header is the new-file start line:
    # @@ -old_start,old_count +new_start,new_count @@
    new_start_pattern = re.compile(r"\+(\d+)")

    for line in diff_output.split("\n"):
        if line.startswith("+++ b/"):
            # Start of a new file section
            current_file = line[6:]  # Remove '+++ b/' prefix
            new_line_number = 0
        elif line.startswith("@@"):
            match = new_start_pattern.search(line)
            if match:
                # Will be incremented before use
                new_line_number = int(match.group(1)) - 1
        elif line.startswith("+") and not line.startswith("+++"):
            new_line_number += 1
            # Record added lines that contain a TODO
            if current_file and todo_pattern.search(line):
                todos.append(
                    TodoInfo(
                        commit_hash=commit_hash,
                        commit_date=commit_date,
                        file_path=current_file,
                        line_number=new_line_number,
                        todo_text=line[1:],  # Drop the leading '+'
                        author=author,
                    )
                )
        elif line.startswith(" "):
            # Unchanged context lines also exist in the new file, so they
            # must advance the counter or later line numbers come out low.
            new_line_number += 1
    return todos
def extract_todo_signature(todo_text: str) -> str:
    """Return the whitespace-normalized text that follows a TODO marker.

    Two patterns are tried in order: a strict one (marker followed by a
    colon or whitespace, text cut at a newline, ``//`` or ``#``) and a
    looser fallback that takes everything after the marker up to the end of
    the line. If neither matches, the stripped input is returned unchanged.
    """
    flags = re.IGNORECASE | re.DOTALL
    candidate_patterns = (
        r"TODO[:\s]+(.+?)(?:\n|$|//|#)",  # strict form
        r"TODO[:\s]*(.+?)(?:\n|$)",  # fallback: anything after TODO
    )
    for pattern in candidate_patterns:
        found = re.search(pattern, todo_text, flags)
        if found is not None:
            # split/join collapses runs of whitespace and trims both ends
            return " ".join(found.group(1).split())
    return todo_text.strip()
def todo_exists_in_current_codebase(
    todo_info: TodoInfo, current_todos: Set[str]
) -> bool:
    """Check if a TODO still exists in the current codebase.

    Args:
        todo_info: The historical TODO to look up.
        current_todos: Signatures of the TODOs present in the working tree.

    Returns:
        ``True`` when the TODO's signature is present in ``current_todos``.
    """
    todo_signature = extract_todo_signature(todo_info.todo_text)
    # The current-TODO set holds whitespace-collapsed, lower-cased
    # signatures, so normalize the same way before looking it up.
    normalized = " ".join(todo_signature.split()).lower()
    # The raw signature is still accepted so sets built without that
    # normalization keep matching as before.
    return normalized in current_todos or todo_signature in current_todos
async def process_commit(commit_hash: str) -> List[TodoInfo]:
    """Return the TODOs added by ``commit_hash``.

    The commit's metadata and its diff are fetched concurrently, then the
    diff is scanned for added TODO lines.
    """
    metadata_request = get_commit_info(commit_hash)
    diff_request = run_git_command(["show", "--format=", commit_hash])
    metadata, diff_output = await asyncio.gather(metadata_request, diff_request)
    commit_date, author = metadata
    return extract_todos_from_diff(diff_output, commit_hash, commit_date, author)
def check_todo_exists(todo: TodoInfo, current_todos: Set[str]) -> Optional[TodoInfo]:
    """Return ``todo`` if it is still present in the codebase, else ``None``."""
    if todo_exists_in_current_codebase(todo, current_todos):
        return todo
    return None
async def find_oldest_todo(
    commits: List[str], current_todos: Set[str]
) -> Tuple[Optional[TodoInfo], int]:
    """Find the oldest TODO in the codebase.

    Commits are scanned in the order given (expected oldest first) and the
    search stops at the first added TODO that is still present.

    Args:
        commits: Commit hashes to scan, oldest first.
        current_todos: Signatures of the TODOs present in the working tree.

    Returns:
        ``(todo, todos_checked)`` where ``todo`` is the first surviving TODO
        found, or ``None`` if no commit added a TODO that still exists, and
        ``todos_checked`` is the number of surviving TODOs encountered.
    """
    todos_checked = 0
    processed_commits = 0
    # Process commits in batches for better progress reporting
    batch_size = 100
    for batch_start in range(0, len(commits), batch_size):
        batch_commits = commits[batch_start : batch_start + batch_size]
        # Process all commits in this batch concurrently; gather() returns
        # results in input order, so oldest-first ordering is preserved.
        commit_results = await asyncio.gather(
            *(process_commit(commit_hash) for commit_hash in batch_commits)
        )
        for todos in commit_results:
            for todo in todos:
                if check_todo_exists(todo, current_todos) is None:
                    continue
                todos_checked += 1
                print("\nFound oldest TODO still in codebase:")
                print(f" Commit: {todo.commit_hash[:8]} ({todo.commit_date})")
                print(f" Author: {todo.author}")
                print(f" File: {todo.file_path}:{todo.line_number}")
                print(f" TODO: {todo.todo_text[:100]}...")
                return todo, todos_checked
        processed_commits += len(batch_commits)
        print(
            f"Processed {processed_commits}/{len(commits)} commits...",
            file=sys.stderr,
        )
    # No surviving TODO was found. Return a real tuple so callers that
    # unpack the result do not crash on an implicit None.
    return None, todos_checked
async def main():
    """Find the oldest surviving TODO and print a report about it.

    Current TODOs and the commit list are collected concurrently, the
    history is scanned, and a summary (plus the originating commit message
    when a TODO is found) is printed.
    """
    print("Finding oldest TODO in codebase...")
    print("Fetching all current TODOs from codebase...")
    current_todos, commits = await asyncio.gather(
        get_current_todos(), get_all_commits()
    )
    print(f"Found {len(current_todos)} unique TODO comments in current codebase\n")
    print("Traversing git history to find when each TODO was added...")
    print("This may take a while...\n")
    print(f"Found {len(commits)} commits to check\n")

    oldest_todo, todos_checked = await find_oldest_todo(commits, current_todos)

    print("\n\nSummary:")
    print(f" Total commits checked: {len(commits)}")
    print(f" TODOs still in codebase: {todos_checked}")

    if not oldest_todo:
        print("\nNo TODOs found that still exist in the codebase.")
        return

    print("\n\nOLDEST TODO STILL IN CODEBASE:")
    print(f" Commit: {oldest_todo.commit_hash}")
    print(f" Date: {oldest_todo.commit_date}")
    print(f" Author: {oldest_todo.author}")
    print(f" File: {oldest_todo.file_path}:{oldest_todo.line_number}")
    print(f" TODO: {oldest_todo.todo_text}")

    # Show the commit message, skipping blank lines
    commit_msg = await run_git_command(
        ["log", "-1", "--format=%B", oldest_todo.commit_hash]
    )
    print("\n Commit message:")
    for message_line in commit_msg.split("\n"):
        if message_line.strip():
            print(f" {message_line}")
# Script entry point: run the async main() only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment