Created
September 16, 2025 14:50
-
-
Save skorfmann/9af0b3e2272c052465f988beaa81dbcb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Git Metrics Analyzer - Derive commit statistics and developer activity metrics from git history | |
| Similar to GitClear's research on developer commit patterns and annual days active | |
| """ | |
| import subprocess | |
| import json | |
| import argparse | |
| from datetime import datetime, timedelta | |
| from collections import defaultdict | |
| import statistics | |
| import sys | |
| from pathlib import Path | |
class GitMetricsAnalyzer:
    """Derive commit statistics and developer-activity metrics from git history.

    Produces per-developer commit counts, active-day counts, annual
    breakdowns, and percentile distributions across contributors, similar
    to GitClear's research on developer commit patterns.
    """

    def __init__(self, repo_path="."):
        # Directory the git commands run in (passed to subprocess as cwd).
        self.repo_path = Path(repo_path)

    def run_git_command(self, cmd):
        """Run a git command and return its stripped stdout, or None on failure.

        Args:
            cmd: Either an argument list (preferred — executed without a
                shell) or a legacy command string (executed via the shell,
                kept for backward compatibility).

        Returns:
            The command's stdout with surrounding whitespace stripped, or
            None if git exited non-zero.
        """
        try:
            result = subprocess.run(
                cmd,
                # Lists run without a shell, which avoids quoting bugs and
                # shell injection from user-supplied arguments; strings keep
                # the old shell behavior so existing callers still work.
                shell=isinstance(cmd, str),
                cwd=self.repo_path,
                capture_output=True,
                text=True,
                check=True
            )
            return result.stdout.strip()
        except subprocess.CalledProcessError as e:
            # Include stderr: the exception alone only shows the exit status,
            # not why git failed.
            print(f"Git command failed: {e}")
            if e.stderr:
                print(e.stderr.strip())
            return None

    def get_commit_data(self, since_date=None, until_date=None):
        """Extract commit metadata from `git log --all`.

        Args:
            since_date: Optional lower bound forwarded to `git log --since`.
            until_date: Optional upper bound forwarded to `git log --until`.

        Returns:
            List of dicts with keys 'hash', 'author_email', 'author_name',
            'date' (ISO 8601 with offset, from %ai) and 'timestamp' (unix
            seconds, from %at). Empty list if git failed or nothing matched.
        """
        # Field order: hash|author_email|author_name|ISO date|unix timestamp
        cmd = ['git', 'log', '--all', '--format=%H|%ae|%an|%ai|%at']
        # Dates are passed as discrete argv entries (not interpolated into a
        # shell string), so quotes or spaces in them cannot break the command.
        if since_date:
            cmd.append(f'--since={since_date}')
        if until_date:
            cmd.append(f'--until={until_date}')
        output = self.run_git_command(cmd)
        if not output:
            return []
        commits = []
        for line in output.split('\n'):
            if not line:
                continue
            parts = line.split('|')
            if len(parts) < 5:
                continue
            # Author names may themselves contain '|'. Hash/email lead the
            # record and date/timestamp trail it, so rejoin whatever is in
            # the middle as the name instead of trusting fixed indices.
            commits.append({
                'hash': parts[0],
                'author_email': parts[1],
                'author_name': '|'.join(parts[2:-2]),
                'date': parts[-2],
                'timestamp': int(parts[-1])
            })
        return commits

    def calculate_developer_metrics(self, commits):
        """Group commits by author email and track per-developer activity.

        Returns:
            Mapping author_email -> {'commits': list of commit dicts,
            'active_dates': set of date, 'first_commit': datetime (earliest),
            'last_commit': datetime (latest)}.
        """
        developer_data = defaultdict(lambda: {
            'commits': [],
            'active_dates': set(),
            'first_commit': None,
            'last_commit': None
        })
        for commit in commits:
            author = commit['author_email']
            # %ai looks like "2024-01-02 10:00:00 +0100"; keep the date part.
            commit_date = datetime.fromisoformat(commit['date'].split()[0])
            data = developer_data[author]
            data['commits'].append(commit)
            data['active_dates'].add(commit_date.date())
            # git log emits newest-first, so compare explicitly rather than
            # rely on input order (the old code inverted first/last).
            if data['first_commit'] is None or commit_date < data['first_commit']:
                data['first_commit'] = commit_date
            if data['last_commit'] is None or commit_date > data['last_commit']:
                data['last_commit'] = commit_date
        return developer_data

    def calculate_annual_metrics(self, developer_data):
        """Break each developer's activity down per calendar year.

        Returns:
            Mapping author_email -> year -> {'commits': int,
            'active_days_count': int}.
        """
        annual_metrics = defaultdict(lambda: defaultdict(lambda: {
            'commits': 0,
            'active_days': set()
        }))
        for author, data in developer_data.items():
            for commit in data['commits']:
                commit_date = datetime.fromisoformat(commit['date'].split()[0])
                year_stats = annual_metrics[author][commit_date.year]
                year_stats['commits'] += 1
                year_stats['active_days'].add(commit_date.date())
        # Replace the date sets with plain counts so the result is
        # JSON-serializable.
        for author in annual_metrics:
            for year_stats in annual_metrics[author].values():
                year_stats['active_days_count'] = len(year_stats['active_days'])
                del year_stats['active_days']
        return annual_metrics

    def calculate_percentiles(self, values, percentiles=(10, 25, 50, 75, 90, 95, 99)):
        """Return {'pN': value} for each requested percentile of `values`.

        Uses the simple rank index = floor(n * p / 100), clamped to the last
        element (same scheme as the original). Empty input yields {}.
        The default is a tuple to avoid a shared mutable default argument.
        """
        if not values:
            return {}
        sorted_values = sorted(values)
        n = len(sorted_values)
        result = {}
        for p in percentiles:
            # Clamp so p=100 (or tiny inputs) never index past the end.
            index = min(int(n * p / 100), n - 1)
            result[f'p{p}'] = sorted_values[index]
        return result

    def analyze_repository(self, since_date=None, until_date=None):
        """Run the full analysis and return a JSON-serializable result dict.

        Prints progress to stdout. Returns None when no commits are found.
        """
        print(f"Analyzing repository: {self.repo_path}")
        print("-" * 50)
        commits = self.get_commit_data(since_date, until_date)
        if not commits:
            print("No commits found in the repository.")
            return None
        print(f"Total commits analyzed: {len(commits)}")
        developer_data = self.calculate_developer_metrics(commits)
        print(f"Total unique contributors: {len(developer_data)}")
        annual_metrics = self.calculate_annual_metrics(developer_data)
        # Aggregate per-developer totals for the percentile distributions.
        all_commit_counts = [len(d['commits']) for d in developer_data.values()]
        all_active_days = [len(d['active_dates']) for d in developer_data.values()]
        annual_active_days = [
            year_stats['active_days_count']
            for years in annual_metrics.values()
            for year_stats in years.values()
        ]
        # Order by unix timestamp: lexicographic comparison of the ISO
        # strings can misorder commits recorded with different UTC offsets.
        first = min(commits, key=lambda c: c['timestamp'])
        last = max(commits, key=lambda c: c['timestamp'])
        return {
            'summary': {
                'total_commits': len(commits),
                'unique_contributors': len(developer_data),
                'date_range': {
                    'first_commit': first['date'],
                    'last_commit': last['date']
                }
            },
            'commit_count_percentiles': self.calculate_percentiles(all_commit_counts),
            'total_active_days_percentiles': self.calculate_percentiles(all_active_days),
            'annual_active_days_percentiles': self.calculate_percentiles(annual_active_days),
            'developer_metrics': {
                author: {
                    'total_commits': len(data['commits']),
                    'total_active_days': len(data['active_dates']),
                    'annual_breakdown': annual_metrics[author]
                }
                for author, data in developer_data.items()
            }
        }

    def print_results(self, results, top=10):
        """Pretty-print the dict returned by analyze_repository.

        Args:
            results: Analysis dict (silently ignored if falsy).
            top: Number of top contributors to list (default 10, matching
                the previous hard-coded behavior).
        """
        if not results:
            return
        print("\n" + "=" * 60)
        print("GIT METRICS ANALYSIS RESULTS")
        print("=" * 60)
        summary = results['summary']
        print("\nRepository Summary:")
        print(f"  Total Commits: {summary['total_commits']:,}")
        print(f"  Unique Contributors: {summary['unique_contributors']}")
        # Show only the YYYY-MM-DD part of the full ISO timestamps.
        print(f"  Date Range: {summary['date_range']['first_commit'][:10]} to {summary['date_range']['last_commit'][:10]}")
        print("\nCommit Count Percentiles (per developer):")
        for percentile, value in results['commit_count_percentiles'].items():
            print(f"  {percentile}: {value} commits")
        print("\nTotal Active Days Percentiles (per developer):")
        for percentile, value in results['total_active_days_percentiles'].items():
            print(f"  {percentile}: {value} days")
        print("\nAnnual Active Days Percentiles (developer-years):")
        for percentile, value in results['annual_active_days_percentiles'].items():
            print(f"  {percentile}: {value} days/year")
        # `top` was previously hard-coded to 10, ignoring the --top CLI flag.
        print(f"\nTop {top} Contributors by Commit Count:")
        sorted_devs = sorted(
            results['developer_metrics'].items(),
            key=lambda x: x[1]['total_commits'],
            reverse=True
        )[:top]
        for i, (author, metrics) in enumerate(sorted_devs, 1):
            print(f"  {i}. {author}: {metrics['total_commits']} commits, {metrics['total_active_days']} active days")

    def export_results(self, results, output_file):
        """Write the analysis dict to `output_file` as indented JSON.

        `default=str` stringifies any non-JSON values (e.g. datetimes) a
        caller may have added to the results.
        """
        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2, default=str)
        print(f"\nResults exported to: {output_file}")
def main():
    """CLI entry point: parse arguments, validate the repo, run and report."""
    parser = argparse.ArgumentParser(
        description='Analyze git repository metrics similar to GitClear research'
    )
    parser.add_argument(
        'repo_path',
        nargs='?',
        default='.',
        help='Path to git repository (default: current directory)'
    )
    parser.add_argument(
        '--since',
        help='Analyze commits since this date (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--until',
        help='Analyze commits until this date (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--export',
        help='Export results to JSON file'
    )
    parser.add_argument(
        '--top',
        type=int,
        default=10,
        help='Number of top contributors to show (default: 10)'
    )
    args = parser.parse_args()
    # Verify git repository. NOTE(review): a `.git` *file* (worktree /
    # submodule) also passes this check, while bare repositories are
    # rejected — confirm that is the intended scope.
    if not Path(args.repo_path).joinpath('.git').exists():
        print(f"Error: {args.repo_path} is not a git repository")
        sys.exit(1)
    # Run analysis
    analyzer = GitMetricsAnalyzer(args.repo_path)
    results = analyzer.analyze_repository(args.since, args.until)
    # Display results
    if results:
        import inspect
        # --top was parsed but never used. Forward it only when
        # print_results actually accepts a `top` parameter, so this still
        # works against a version of the class that hard-codes 10.
        if 'top' in inspect.signature(analyzer.print_results).parameters:
            analyzer.print_results(results, top=args.top)
        else:
            analyzer.print_results(results)
        # Export if requested
        if args.export:
            analyzer.export_results(results, args.export)
# Run the CLI only when executed directly, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment