Skip to content

Instantly share code, notes, and snippets.

@davydmaker
Created September 23, 2025 00:07
Show Gist options
  • Select an option

  • Save davydmaker/b14acf98e13c58f9cc679fad9686e105 to your computer and use it in GitHub Desktop.

Select an option

Save davydmaker/b14acf98e13c58f9cc679fad9686e105 to your computer and use it in GitHub Desktop.
Directory diff tool with interactive reporting and complete analysis
#!/usr/bin/env python3
"""
Script to recursively compare all files between two directories.
Generates text reports by default, with option for HTML format.
Usage:
python directory_diff.py <base> <compare> [-o output] [--format {text,html}] [--ignore PATTERN ...]
Examples:
python directory_diff.py base compare # Text report (default)
python directory_diff.py base compare -o my-report # Save as my-report.txt
python directory_diff.py base compare --format html # HTML report
python directory_diff.py base compare --format html -o my-report # Save as my-report.html
"""
import argparse
import difflib
import fnmatch
import hashlib
import html
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple
class DirectoryComparator:
    """
    Class to compare files and directories recursively.
    Handles file content comparison and report generation.

    Attributes:
        base: Resolved path of the reference directory.
        compare: Resolved path of the candidate directory.
        ignore_patterns: Set of glob patterns; a file is skipped when any
            component of its path (relative to the compared root) matches.
        max_diff_lines: Diffs with more changed lines than this are
            summarised in reports instead of being shown in full.
    """

    # Patterns that are always ignored, in addition to user-supplied ones.
    DEFAULT_IGNORE_PATTERNS = frozenset({
        '.git', 'node_modules', 'logs',
        '*.log', '.env', 'dist', 'build',
        '.vscode', '.idea', '*.tmp', '*.temp',
    })

    # Default cut-off above which a diff is considered too long to display.
    DEFAULT_MAX_DIFF_LINES = 100

    # Read size used while hashing; large enough to keep syscall count low.
    _HASH_CHUNK_SIZE = 64 * 1024

    # CSS class used for each kind of line yielded by _classify_diff_lines.
    _DIFF_CSS_CLASSES = {
        'header': 'diff-line-header',
        'hunk': 'diff-line-header',
        'added': 'diff-line-added',
        'removed': 'diff-line-removed',
        'context': 'diff-line-context',
        'other': 'diff-line-context',
    }

    # Static stylesheet of the HTML report. Kept out of the f-string template
    # so the CSS braces do not need to be doubled.
    _HTML_STYLE = """
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            padding: 30px;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        h1 {
            color: #333;
            border-bottom: 3px solid #007acc;
            padding-bottom: 10px;
            margin-bottom: 30px;
        }
        h2 {
            color: #555;
            margin-top: 30px;
            border-left: 4px solid #007acc;
            padding-left: 15px;
        }
        .summary {
            background: #f8f9fa;
            padding: 20px;
            border-radius: 5px;
            margin-bottom: 30px;
        }
        .summary-item {
            display: inline-block;
            margin: 10px 20px 10px 0;
            padding: 10px 15px;
            background: white;
            border-radius: 5px;
            border-left: 4px solid #007acc;
        }
        .file-list {
            background: #f8f9fa;
            padding: 15px;
            border-radius: 5px;
            margin: 15px 0;
        }
        .file-item {
            margin: 5px 0;
            padding: 8px;
            background: white;
            border-radius: 3px;
            font-family: monospace;
        }
        .diff-file {
            border-left: 4px solid #ffc107;
            background: #fff3cd;
        }
        .added-file {
            border-left: 4px solid #28a745;
            background: #d4edda;
        }
        .removed-file {
            border-left: 4px solid #dc3545;
            background: #f8d7da;
        }
        .diff-content {
            background: #f8f9fa;
            border: 1px solid #e9ecef;
            border-radius: 5px;
            padding: 0;
            margin: 10px 0;
            font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
            font-size: 13px;
            max-height: 400px;
            overflow-y: auto;
            line-height: 1.4;
        }
        .diff-line {
            padding: 2px 10px;
            margin: 0;
            white-space: pre-wrap;
            border-left: 3px solid transparent;
        }
        .diff-line-added {
            background-color: #e6ffed;
            border-left-color: #28a745;
            color: #22863a;
        }
        .diff-line-added::before {
            content: "+";
            color: #28a745;
            font-weight: bold;
            margin-right: 8px;
        }
        .diff-line-removed {
            background-color: #ffeef0;
            border-left-color: #d73a49;
            color: #b31d28;
        }
        .diff-line-removed::before {
            content: "-";
            color: #d73a49;
            font-weight: bold;
            margin-right: 8px;
        }
        .diff-line-context {
            background-color: #f8f9fa;
            color: #586069;
        }
        .diff-line-context::before {
            content: " ";
            margin-right: 8px;
        }
        .diff-line-header {
            background-color: #f1f8ff;
            color: #0366d6;
            font-weight: bold;
            border-left-color: #0366d6;
        }
        .diff-line:hover {
            background-color: rgba(255, 255, 255, 0.1);
        }
        .meta {
            color: #666;
            font-size: 12px;
            margin-bottom: 10px;
        }
        .collapsible {
            cursor: pointer;
            user-select: none;
        }
        .collapsible:hover {
            background: #e9ecef;
        }
        .content {
            display: none;
        }
        .content.active {
            display: block;
        }
        .toggle {
            float: right;
            font-weight: bold;
        }
    """

    # Static script of the HTML report: expands/collapses one diff entry.
    _HTML_SCRIPT = """
        function toggleContent(element) {
            const content = element.nextElementSibling;
            const toggle = element.querySelector('.toggle');
            if (content.classList.contains('active')) {
                content.classList.remove('active');
                toggle.textContent = '+';
            } else {
                content.classList.add('active');
                toggle.textContent = '-';
            }
        }
    """

    def __init__(self, base: str, compare: str,
                 ignore_patterns: Optional[Iterable[str]] = None,
                 max_diff_lines: int = DEFAULT_MAX_DIFF_LINES):
        """
        Initialize comparator with base and compare directories and ignore patterns.
        Args:
            base: Base directory path (reference)
            compare: Compare directory path (candidate)
            ignore_patterns: Additional patterns to ignore (combined with defaults)
            max_diff_lines: Largest number of changed lines shown in full per file
        """
        self.base = Path(base).resolve()
        self.compare = Path(compare).resolve()
        self.max_diff_lines = max_diff_lines

        # A lone string would otherwise be iterated character by character.
        if isinstance(ignore_patterns, str):
            ignore_patterns = (ignore_patterns,)
        self.ignore_patterns = set(self.DEFAULT_IGNORE_PATTERNS) | set(ignore_patterns or ())

    def _relative_parts(self, path: Path) -> Tuple[str, ...]:
        """Return the components of *path*, relative to a compared root when possible."""
        path = Path(path)
        if path.is_absolute():
            for root in (self.base, self.compare):
                try:
                    return path.relative_to(root).parts
                except ValueError:
                    continue
        return path.parts

    def should_ignore(self, path: Path) -> bool:
        """
        Check if a file/directory should be ignored based on patterns.

        Patterns are shell-style globs matched case-sensitively against each
        individual path component, so 'build' ignores a 'build' directory but
        not 'rebuild.py'. Absolute paths below one of the compared roots are
        relativized first, so the location of the roots themselves never
        triggers a match. A pattern containing '/' is matched against the
        whole relative path (exactly, as a directory prefix, or as a glob).
        Args:
            path: Path to check
        Returns:
            bool: True if path matches ignore pattern
        """
        parts = self._relative_parts(path)
        relative = "/".join(parts)
        for pattern in self.ignore_patterns:
            if "/" in pattern:
                anchored = pattern.strip("/")
                if anchored and (
                    relative == anchored
                    or relative.startswith(anchored + "/")
                    or fnmatch.fnmatchcase(relative, anchored)
                ):
                    return True
            elif any(fnmatch.fnmatchcase(part, pattern) for part in parts):
                return True
        return False

    def get_file_hash(self, file_path: Path) -> str:
        """
        Calculate MD5 hash of a file.

        MD5 is used purely as a fast change detector, not for security.
        Args:
            file_path: Path to file
        Returns:
            str: MD5 hex digest, or a string starting with "ERROR: " on failure
        """
        try:
            digest = hashlib.md5()
            with open(file_path, "rb") as f:
                for chunk in iter(lambda: f.read(self._HASH_CHUNK_SIZE), b""):
                    digest.update(chunk)
            return digest.hexdigest()
        except (OSError, ValueError) as e:
            # OSError: unreadable file; ValueError: MD5 disabled by the platform.
            return f"ERROR: {str(e)}"

    def get_all_files(self, directory: Path) -> Dict[str, Path]:
        """
        Get all files recursively in a directory.

        Ignore patterns are applied to the path relative to *directory*, so
        the names of parent directories of the root have no influence.
        Args:
            directory: Root directory path
        Returns:
            Dict mapping relative paths to absolute paths
        """
        directory = Path(directory)
        files: Dict[str, Path] = {}
        try:
            for item in directory.rglob('*'):
                if not item.is_file():
                    continue
                relative_path = item.relative_to(directory)
                if self.should_ignore(relative_path):
                    continue
                files[str(relative_path)] = item
        except OSError as e:
            # Best effort: report the problem and return what was collected.
            print(f"Error reading directory {directory}: {e}")
        return files

    @staticmethod
    def _diff_label(root: Path, file_path: Path) -> str:
        """Build the '<root name>/<relative path>' label used in diff headers."""
        try:
            relative = Path(file_path).relative_to(root)
        except ValueError:
            # File lives outside the root; fall back to the path as given.
            return str(file_path)
        return f"{root.name}/{relative}"

    def compare_files_content(self, file1: Path, file2: Path) -> Tuple[bool, str]:
        """
        Compare content of two files.

        Files are first compared by hash. When the hashes differ, both files
        are read as UTF-8 text (with universal newlines) and a unified diff is
        produced. Files that differ only in their line-ending style therefore
        yield an empty diff and are reported as the same.
        Args:
            file1: First file path
            file2: Second file path
        Returns:
            Tuple of (is_same, diff_info)
        """
        hash1 = self.get_file_hash(file1)
        hash2 = self.get_file_hash(file2)
        if hash1.startswith("ERROR") or hash2.startswith("ERROR"):
            return False, f"Error reading files: {hash1}, {hash2}"
        if hash1 == hash2:
            return True, "Files are identical"

        try:
            with open(file1, 'r', encoding='utf-8') as f1, \
                    open(file2, 'r', encoding='utf-8') as f2:
                lines1 = f1.readlines()
                lines2 = f2.readlines()
        except UnicodeDecodeError:
            return False, "Binary files differ"
        except OSError as e:
            return False, f"Error comparing files: {str(e)}"

        diff = difflib.unified_diff(
            lines1, lines2,
            fromfile=self._diff_label(self.base, file1),
            tofile=self._diff_label(self.compare, file2),
            lineterm=''
        )
        # Content lines still carry their own '\n'; strip it so that joining
        # with '\n' does not leave a blank line after every diff line.
        diff_lines = [line.rstrip('\n') for line in diff]
        if diff_lines:
            return False, '\n'.join(diff_lines)
        return True, "Files are identical (ignoring line endings)"

    def _get_comparison_data(self) -> Tuple[List[str], List[str], List[Dict[str, Any]], List[str]]:
        """
        Get comparison data for report generation.

        Prints progress information to stdout while working.
        Returns:
            Tuple containing (only_in_base, only_in_compare, different_files, identical_files).
            Each entry of different_files is a dict with the keys
            'path', 'file1', 'file2' and 'diff'.
        """
        print("Starting directory comparison...")
        print(f"Base: {self.base}")
        print(f"Compare: {self.compare}\n")
        files1 = self.get_all_files(self.base)
        files2 = self.get_all_files(self.compare)
        all_files = set(files1) | set(files2)

        only_in_base: List[str] = []
        only_in_compare: List[str] = []
        different_files: List[Dict[str, Any]] = []
        identical_files: List[str] = []

        print(f"Total unique files found: {len(all_files)}")
        print("Processing...\n")
        for relative_path in sorted(all_files):
            file1 = files1.get(relative_path)
            file2 = files2.get(relative_path)
            if file1 is None:
                only_in_compare.append(relative_path)
            elif file2 is None:
                only_in_base.append(relative_path)
            else:
                is_same, diff_info = self.compare_files_content(file1, file2)
                if is_same:
                    identical_files.append(relative_path)
                else:
                    different_files.append({
                        'path': relative_path,
                        'file1': file1,
                        'file2': file2,
                        'diff': diff_info
                    })
        return only_in_base, only_in_compare, different_files, identical_files

    @staticmethod
    def _classify_diff_lines(diff_content: str) -> Iterator[Tuple[str, str]]:
        """
        Yield (kind, line) for every non-empty line of a unified diff.

        kind is one of 'header', 'hunk', 'added', 'removed', 'context' or
        'other' (plain messages such as "Binary files differ"). File headers
        ('---'/'+++') are only recognised before the first '@@' hunk header,
        so a removed line whose text starts with '--' (or an added line
        starting with '++') is classified as a change, not as a header.
        """
        in_hunk = False
        for line in diff_content.split('\n'):
            if not line:
                continue
            if line.startswith('@@'):
                in_hunk = True
                yield 'hunk', line
            elif not in_hunk and line.startswith(('+++', '---')):
                yield 'header', line
            elif line.startswith('+'):
                yield 'added', line
            elif line.startswith('-'):
                yield 'removed', line
            elif line.startswith(' '):
                yield 'context', line
            else:
                yield 'other', line

    def _count_diff_lines(self, diff_content: str) -> int:
        """Count meaningful diff lines (added/removed, excluding headers)."""
        return sum(
            1 for kind, _ in self._classify_diff_lines(diff_content)
            if kind in ('added', 'removed')
        )

    def _format_diff_for_html(self, diff_content: str) -> str:
        """Format diff content for HTML with syntax highlighting."""
        formatted_lines = []
        for kind, line in self._classify_diff_lines(diff_content):
            # The +/-/space marker is rendered by CSS (::before), so drop it.
            text = line[1:] if kind in ('added', 'removed', 'context') else line
            css_class = self._DIFF_CSS_CLASSES[kind]
            formatted_lines.append(
                f'<div class="diff-line {css_class}">{html.escape(text, quote=False)}</div>'
            )
        return ''.join(formatted_lines)

    def generate_text_report(self) -> str:
        """
        Generate clean text comparison report.
        Returns:
            str: Formatted report text
        """
        only_in_base, only_in_compare, different_files, identical_files = self._get_comparison_data()
        rule = "=" * 80
        report = [
            rule,
            "DIRECTORY COMPARISON REPORT",
            rule,
            f"Date/Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"Base: {self.base}",
            f"Compare: {self.compare}",
            "",
            "SUMMARY:",
            f" Identical files: {len(identical_files)}",
            f" Different files: {len(different_files)}",
            f" Files only in base: {len(only_in_base)}",
            f" Files only in compare: {len(only_in_compare)}",
            "",
        ]

        one_sided_sections = (
            ("FILES ONLY IN BASE:", only_in_base),
            ("FILES ONLY IN COMPARE:", only_in_compare),
        )
        for title, paths in one_sided_sections:
            if not paths:
                continue
            report.append(title)
            report.append("-" * 50)
            report.extend(f" {file_path}" for file_path in sorted(paths))
            report.append("")

        if different_files:
            report.append("DIFFERENT FILES:")
            report.append("-" * 50)
            for file_info in different_files:
                report.append(f"File: {file_info['path']}")
                report.append(f" base: {file_info['file1']}")
                report.append(f" compare: {file_info['file2']}")
                diff_lines_count = self._count_diff_lines(file_info['diff'])
                if diff_lines_count <= self.max_diff_lines:
                    report.append(" Differences:")
                    # Context lines are omitted to keep the text report short.
                    report.extend(
                        f" {line}"
                        for kind, line in self._classify_diff_lines(file_info['diff'])
                        if kind != 'context'
                    )
                else:
                    report.append(f" Differences: {diff_lines_count} lines (too long to display)")
                report.append("")

        report.append(rule)
        return '\n'.join(report)

    def _render_html_diff_entry(self, file_info: Dict[str, Any]) -> str:
        """Render one collapsible 'different file' entry of the HTML report."""
        diff_lines_count = self._count_diff_lines(file_info['diff'])
        rel_path = html.escape(str(file_info['path']))
        base_path = html.escape(str(file_info['file1']))
        compare_path = html.escape(str(file_info['file2']))
        entry = [
            '<div class="file-item diff-file">',
            '<div class="collapsible" onclick="toggleContent(this)">',
            f'<strong>{rel_path}</strong>',
            '<span class="toggle">+</span>',
            '</div>',
            '<div class="content">',
            '<div class="meta">',
            f'Base: {base_path}<br>',
            f'Compare: {compare_path}<br>',
            f'Changes: {diff_lines_count} lines',
            '</div>',
        ]
        if diff_lines_count <= self.max_diff_lines:
            formatted_diff = self._format_diff_for_html(file_info['diff'])
            if formatted_diff:
                entry.append(f'<div class="diff-content">{formatted_diff}</div>')
        else:
            entry.append(
                '<div class="diff-content"><div class="diff-line diff-line-context">'
                f'Differences too long to display ({diff_lines_count} lines)</div></div>'
            )
        entry.append('</div></div>')
        return '\n'.join(entry)

    def generate_html_report(self) -> str:
        """
        Generate HTML comparison report.

        All file names and paths are HTML-escaped before being embedded.
        Returns:
            str: HTML formatted report
        """
        only_in_base, only_in_compare, different_files, identical_files = self._get_comparison_data()
        generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        parts: List[str] = [
            '<!DOCTYPE html>',
            '<html>',
            '<head>',
            '<meta charset="UTF-8">',
            '<title>Directory Comparison Report</title>',
            f'<style>{self._HTML_STYLE}</style>',
            f'<script>{self._HTML_SCRIPT}</script>',
            '</head>',
            '<body>',
            '<div class="container">',
            '<h1>Directory Comparison Report</h1>',
            '<div class="meta">',
            f'Generated on: {generated_at}<br>',
            f'Base: <code>{html.escape(str(self.base))}</code><br>',
            f'Compare: <code>{html.escape(str(self.compare))}</code>',
            '</div>',
            '<div class="summary">',
            '<h2>Summary</h2>',
        ]

        summary = (
            (len(identical_files), "Identical files"),
            (len(different_files), "Different files"),
            (len(only_in_base), "Only in base"),
            (len(only_in_compare), "Only in compare"),
        )
        for count, label in summary:
            parts.append(
                f'<div class="summary-item"><strong>{count}</strong><br>{label}</div>'
            )
        parts.append('</div>')

        one_sided_sections = (
            ("Files Only in Base", "removed-file", only_in_base),
            ("Files Only in Compare", "added-file", only_in_compare),
        )
        for title, css_class, paths in one_sided_sections:
            if not paths:
                continue
            parts.append(f'<h2>{title}</h2>')
            parts.append('<div class="file-list">')
            parts.extend(
                f'<div class="file-item {css_class}">{html.escape(str(file_path))}</div>'
                for file_path in sorted(paths)
            )
            parts.append('</div>')

        if different_files:
            parts.append('<h2>Different Files</h2>')
            parts.append('<div class="file-list">')
            parts.extend(self._render_html_diff_entry(info) for info in different_files)
            parts.append('</div>')

        parts.extend(['</div>', '</body>', '</html>'])
        return '\n'.join(parts)
def main(argv: Optional[List[str]] = None) -> None:
    """
    Main entry point for the script.

    Parses the command line, validates both directories, runs the comparison
    and emits the report. A text report is written to '<output>.txt' when
    -o is given and printed to stdout otherwise. An HTML report is written to
    '<output>.html', or to 'comparison_report.html' when -o is omitted.
    Args:
        argv: Argument list to parse; defaults to sys.argv[1:] when None.
    Raises:
        SystemExit: With code 1 when either path is missing or not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Recursively compare all files between two directories"
    )
    parser.add_argument("base", help="Base directory (reference, e.g. main branch)")
    parser.add_argument("compare", help="Compare directory (candidate, e.g. dev branch)")
    parser.add_argument("-o", "--output", help="Output file for report (without extension)")
    parser.add_argument("--format", choices=["text", "html"], default="text",
                        help="Output format: text (default) or html")
    parser.add_argument("--ignore", nargs="*", help="Additional patterns to ignore (combined with defaults)")
    args = parser.parse_args(argv)

    for directory in (args.base, args.compare):
        if not os.path.exists(directory):
            print(f"Error: Directory '{directory}' does not exist", file=sys.stderr)
            sys.exit(1)
        # A regular file would otherwise be "compared" as an empty directory.
        if not os.path.isdir(directory):
            print(f"Error: '{directory}' is not a directory", file=sys.stderr)
            sys.exit(1)

    ignore_patterns = set(args.ignore) if args.ignore else None
    comparator = DirectoryComparator(args.base, args.compare, ignore_patterns)

    if args.format == "text":
        text_report = comparator.generate_text_report()
        if args.output:
            text_file = f"{args.output}.txt"
            with open(text_file, 'w', encoding='utf-8') as f:
                f.write(text_report)
            print(f"Text report saved to: {text_file}")
        else:
            # Without -o the report has nowhere else to go; show it.
            print(text_report)
    else:
        html_report = comparator.generate_html_report()
        html_file = f"{args.output}.html" if args.output else "comparison_report.html"
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_report)
        print(f"HTML report saved to: {html_file}")
        # as_uri() yields a well-formed file:// URL (percent-encoded, portable).
        print(f"Open in browser: {Path(html_file).resolve().as_uri()}")


if __name__ == "__main__":
    main()
@davydmaker
Copy link
Author

Created to resolve divergent Git branch histories where traditional diff tools weren't sufficient. This script helps identify all code differences between directories/branches, making it easier to reconcile changes and recreate clean branch structures.

Generates detailed reports (text/HTML) for systematic review and application of differences.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment