Created
November 18, 2025 20:01
-
-
Save jthemphill/d183b80c1442da934d1f743d9e90356c to your computer and use it in GitHub Desktop.
Find the oldest TODO in a Git codebase
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Find the oldest TODO comment that still exists in the codebase. | |
| This script traverses git history commit-by-commit, finds commits that add TODOs, | |
| and checks whether each TODO still exists in the current version of the code. | |
| """ | |
| import asyncio | |
| import re | |
| import sys | |
| from collections import namedtuple | |
| from typing import List, Optional, Set, Tuple | |
# Record for one TODO line introduced by a commit: the commit that added it
# (hash, date, author) and where it was added (file path, line number, text).
TodoInfo = namedtuple(
    "TodoInfo",
    "commit_hash commit_date file_path line_number todo_text author",
)
async def run_git_command(
    cmd: List[str], cwd: str = "/Users/jhemphill/dev/retool_development"
) -> str:
    """Run ``git <cmd>`` asynchronously and return its standard output.

    Args:
        cmd: Arguments passed to ``git`` (without the leading ``git``).
        cwd: Directory the command is run in. Defaults to the repository
            path this script was originally written for.

    Returns:
        The command's stdout, decoded and stripped of surrounding whitespace.

    The process exits with status 1 (after printing the problem to stderr)
    when git cannot be started or returns a non-zero exit code.
    """
    try:
        process = await asyncio.create_subprocess_exec(
            "git",
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=cwd,
        )
        stdout, stderr = await process.communicate()
    except OSError as e:
        # Raised when git is missing or the working directory does not exist.
        print(f"Error running git command: {' '.join(cmd)}", file=sys.stderr)
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    if process.returncode != 0:
        print(f"Error running git command: {' '.join(cmd)}", file=sys.stderr)
        print(f"Error: {stderr.decode(errors='replace')}", file=sys.stderr)
        sys.exit(1)

    # Diffs can contain bytes that are not valid UTF-8 (binary or legacy-encoded
    # files); replace them instead of aborting the whole run on a decode error.
    return stdout.decode(errors="replace").strip()
async def get_current_todos(
    cwd: str = "/Users/jhemphill/dev/retool_development",
) -> Set[str]:
    """Collect the signatures of all TODO comments currently in the codebase.

    Runs ripgrep (``rg``) case-insensitively for ``TODO`` and reduces every
    matching line to a normalized signature (whitespace collapsed, lower-cased).

    Args:
        cwd: Directory to search. Defaults to the repository path this script
            was originally written for.

    Returns:
        The set of normalized TODO signatures found by ripgrep.

    The process exits with status 1 (after printing to stderr) when ripgrep
    cannot be started or fails with an exit code other than 0 or 1.
    """
    try:
        process = await asyncio.create_subprocess_exec(
            "rg",
            "-i",  # Case-insensitive
            # Output is captured through a pipe, where ripgrep omits line
            # numbers by default. Request the "path:line:content" layout
            # explicitly so the parsing below sees the format it expects.
            "--no-heading",
            "--with-filename",
            "--line-number",
            "TODO",  # Pattern to match TODO comments
            ".",  # Explicit path so rg never falls back to searching stdin
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=cwd,
        )
        stdout, stderr = await process.communicate()
    except OSError as e:
        print(f"Error fetching current TODOs with ripgrep: {e}", file=sys.stderr)
        sys.exit(1)

    # Exit code 1 means "no matches"; anything else non-zero is a real error.
    if process.returncode not in (0, 1):
        print(
            f"Error running ripgrep: {stderr.decode(errors='replace')}",
            file=sys.stderr,
        )
        sys.exit(1)

    current_todo_signatures: Set[str] = set()
    for line in stdout.decode(errors="replace").split("\n"):
        if not line.strip():
            continue
        # Record format: file_path:line_number:content
        parts = line.split(":", 2)
        if len(parts) < 3:
            continue
        signature = extract_todo_signature(parts[2])
        if signature:
            # Normalize for comparison
            current_todo_signatures.add(" ".join(signature.split()).lower())
    return current_todo_signatures
async def get_all_commits() -> List[str]:
    """Return all commit hashes as listed by ``git log --reverse`` (oldest first).

    An empty git output yields an empty list.
    """
    raw_hashes = await run_git_command(["log", "--reverse", "--format=%H"])
    if not raw_hashes:
        return []
    return raw_hashes.split("\n")
async def get_commit_info(commit_hash: str) -> Tuple[str, str]:
    """Return ``(date, author)`` for a commit.

    Args:
        commit_hash: The commit to look up.

    Returns:
        A tuple of the author date (git's ``%ai`` format) and the author
        name (``%an``).
    """
    # One git invocation yields both fields, separated by a newline (%n),
    # instead of spawning a separate process for each of them.
    output = await run_git_command(
        ["log", "-1", "--format=%ai%n%an", commit_hash]
    )
    # partition() copes with an empty author name, where the surrounding
    # strip() in run_git_command removes the trailing newline.
    date, _, author = output.partition("\n")
    return date.strip(), author.strip()
def extract_todos_from_diff(
    diff_output: str, commit_hash: str, commit_date: str, author: str
) -> List[TodoInfo]:
    """Extract TODO comments that were added in a commit diff.

    Args:
        diff_output: Unified diff text of the commit.
        commit_hash: Hash stored on every returned record.
        commit_date: Date stored on every returned record.
        author: Author stored on every returned record.

    Returns:
        One ``TodoInfo`` per added line that matches ``TODO`` followed by a
        colon or whitespace (case-insensitive), carrying the file path and the
        line number of that line in the new version of the file.
    """
    todos: List[TodoInfo] = []
    current_file: Optional[str] = None
    # Line number, in the new file, of the most recent new-side line seen.
    new_line_number = 0
    # Body lines the current hunk header still announces for each side. While
    # either is positive we are inside a hunk body, so a line starting with
    # "+++" is added content rather than a file header.
    old_left = 0
    new_left = 0

    # Pattern to match TODO comments (case-insensitive)
    todo_pattern = re.compile(r"TODO[:\s]", re.IGNORECASE)
    # @@ -old_start[,old_count] +new_start[,new_count] @@
    hunk_header = re.compile(r"^@@ -\d+(?:,(\d+))? \+(\d+)(?:,(\d+))? @@")

    for line in diff_output.split("\n"):
        in_counted_hunk = old_left > 0 or new_left > 0

        if line.startswith("@@"):
            header = hunk_header.match(line)
            if header:
                # A missing count means exactly one line.
                old_left = int(header.group(1) or 1)
                new_left = int(header.group(3) or 1)
                new_line_number = int(header.group(2)) - 1  # incremented before use
            else:
                # Non-standard header (e.g. a combined diff): line counts are
                # unknown, so only pick up the new-side start line.
                old_left = new_left = 0
                loose = re.search(r"\+(\d+)", line)
                if loose:
                    new_line_number = int(loose.group(1)) - 1
        elif not in_counted_hunk and line.startswith("+++"):
            # File header; "+++ /dev/null" (deleted file) carries no path.
            if line.startswith("+++ b/"):
                current_file = line[6:]  # Remove '+++ b/' prefix
                new_line_number = 0
        elif line.startswith("+"):
            new_left -= 1
            new_line_number += 1
            if current_file and todo_pattern.search(line):
                todos.append(
                    TodoInfo(
                        commit_hash=commit_hash,
                        commit_date=commit_date,
                        file_path=current_file,
                        line_number=new_line_number,
                        todo_text=line[1:],  # drop the leading '+'
                        author=author,
                    )
                )
        elif line.startswith("-"):
            # Removed line: exists only on the old side.
            old_left -= 1
        elif line.startswith(" ") or (in_counted_hunk and not line):
            # Context line (possibly with its single-space prefix stripped):
            # present on both sides, so it advances the new-file line number.
            old_left -= 1
            new_left -= 1
            new_line_number += 1
        # Anything else ("diff --git", "index ...", "\ No newline ...") is
        # metadata and does not affect line numbering.

    return todos
def extract_todo_signature(todo_text: str) -> str:
    """Reduce a TODO line to a whitespace-normalized signature.

    The signature is the text following the TODO marker, so the same comment
    still compares equal after its indentation or inner spacing changes.

    Args:
        todo_text: A line (or snippet) that may contain a TODO comment.

    Returns:
        The captured TODO content with runs of whitespace collapsed to single
        spaces, or the stripped input when no TODO marker is found.
    """
    flags = re.IGNORECASE | re.DOTALL
    patterns = (
        # Preferred form: marker, then ':' / whitespace, then the content up to
        # the end of the line or the start of another comment ('//' or '#').
        r"TODO[:\s]+(.+?)(?:\n|$|//|#)",
        # Fallback: everything after the marker up to the end of the line.
        r"TODO[:\s]*(.+?)(?:\n|$)",
    )
    for pattern in patterns:
        found = re.search(pattern, todo_text, flags)
        if found is not None:
            return " ".join(found.group(1).split())
    return todo_text.strip()
def todo_exists_in_current_codebase(
    todo_info: TodoInfo, current_todos: Set[str]
) -> bool:
    """Check if a TODO still exists in the current codebase.

    Args:
        todo_info: The TODO found in a historical commit.
        current_todos: Normalized signatures (whitespace collapsed,
            lower-cased) as produced by ``get_current_todos``.

    Returns:
        True when the TODO's normalized signature is in ``current_todos``.
    """
    todo_signature = extract_todo_signature(todo_info.todo_text)
    # Apply the same normalization used when the set was built; without the
    # lower-casing, any TODO containing an uppercase letter could never match.
    normalized = " ".join(todo_signature.split()).lower()
    return normalized in current_todos
async def process_commit(commit_hash: str) -> List[TodoInfo]:
    """Return the TODOs added by a single commit.

    The commit's metadata and its diff are fetched concurrently, then the
    diff is scanned for added TODO lines.
    """
    metadata_request = get_commit_info(commit_hash)
    diff_request = run_git_command(["show", "--format=", commit_hash])
    metadata, diff_output = await asyncio.gather(metadata_request, diff_request)

    commit_date, author = metadata
    return extract_todos_from_diff(diff_output, commit_hash, commit_date, author)
def check_todo_exists(todo: TodoInfo, current_todos: Set[str]) -> Optional[TodoInfo]:
    """Return ``todo`` when it is still in the codebase, otherwise ``None``."""
    if todo_exists_in_current_codebase(todo, current_todos):
        return todo
    return None
async def find_oldest_todo(
    commits: List[str], current_todos: Set[str]
) -> Tuple[Optional[TodoInfo], int]:
    """Find the oldest TODO that still exists in the codebase.

    Args:
        commits: Commit hashes ordered oldest first.
        current_todos: Signatures of the TODOs currently in the codebase.

    Returns:
        ``(todo, todos_checked)`` where ``todo`` is the first TODO, walking
        the commits in order, that is still present, and ``todos_checked`` is
        the number of still-present TODOs encountered. ``(None, 0)`` is
        returned when no such TODO is found, so callers can always unpack the
        result.
    """
    todos_checked = 0
    total_commits = len(commits)

    # Process commits in batches for better progress reporting
    batch_size = 100
    for batch_start in range(0, total_commits, batch_size):
        batch_commits = commits[batch_start : batch_start + batch_size]

        # Process all commits in this batch concurrently; gather() returns the
        # results in input order, so oldest-first ordering is preserved.
        commit_results = await asyncio.gather(
            *(process_commit(commit_hash) for commit_hash in batch_commits)
        )

        for todos in commit_results:
            for todo in todos:
                if check_todo_exists(todo, current_todos) is None:
                    continue
                todos_checked += 1
                print(f"\nFound oldest TODO still in codebase:")
                print(f" Commit: {todo.commit_hash[:8]} ({todo.commit_date})")
                print(f" Author: {todo.author}")
                print(f" File: {todo.file_path}:{todo.line_number}")
                print(f" TODO: {todo.todo_text[:100]}...")
                return todo, todos_checked

        processed_commits = batch_start + len(batch_commits)
        print(
            f"Processed {processed_commits}/{total_commits} commits...",
            file=sys.stderr,
        )

    # No TODO introduced in any commit is still present.
    return None, todos_checked
async def main():
    """Drive the search and print a report of the oldest surviving TODO.

    Loads the current TODO signatures and the commit list concurrently,
    searches the history, then prints a summary followed by either the
    details of the oldest TODO (including its commit message) or a notice
    that none was found.
    """
    print("Finding oldest TODO in codebase...")
    print("Fetching all current TODOs from codebase...")

    current_todos, commits = await asyncio.gather(
        get_current_todos(), get_all_commits()
    )
    commit_count = len(commits)

    print(f"Found {len(current_todos)} unique TODO comments in current codebase\n")
    print("Traversing git history to find when each TODO was added...")
    print("This may take a while...\n")
    print(f"Found {commit_count} commits to check\n")

    oldest_todo, todos_checked = await find_oldest_todo(commits, current_todos)

    print("\n\nSummary:")
    print(f" Total commits checked: {commit_count}")
    print(f" TODOs still in codebase: {todos_checked}")

    if not oldest_todo:
        print("\nNo TODOs found that still exist in the codebase.")
        return

    print("\n\nOLDEST TODO STILL IN CODEBASE:")
    print(f" Commit: {oldest_todo.commit_hash}")
    print(f" Date: {oldest_todo.commit_date}")
    print(f" Author: {oldest_todo.author}")
    print(f" File: {oldest_todo.file_path}:{oldest_todo.line_number}")
    print(f" TODO: {oldest_todo.todo_text}")

    # Show the commit message, skipping blank lines
    commit_msg = await run_git_command(
        ["log", "-1", "--format=%B", oldest_todo.commit_hash]
    )
    print("\n Commit message:")
    for message_line in commit_msg.split("\n"):
        if message_line.strip():
            print(f" {message_line}")
# Script entry point: start the event loop only when run directly, not on import.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment