Skip to content

Instantly share code, notes, and snippets.

@skorfmann
Created September 16, 2025 14:50
Show Gist options
  • Select an option

  • Save skorfmann/9af0b3e2272c052465f988beaa81dbcb to your computer and use it in GitHub Desktop.

Select an option

Save skorfmann/9af0b3e2272c052465f988beaa81dbcb to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Git Metrics Analyzer - Derive commit statistics and developer activity metrics from git history
Similar to GitClear's research on developer commit patterns and annual days active
"""
import subprocess
import json
import argparse
from datetime import datetime, timedelta
from collections import defaultdict
import statistics
import sys
from pathlib import Path
class GitMetricsAnalyzer:
    """Compute commit-count and active-day statistics from a git history.

    Developers are identified by author e-mail. Calendar days and years are
    taken from the date portion of git's ``%ai`` author date, i.e. in the
    time zone recorded with each commit.
    """

    # Percentile ranks reported when a caller does not supply its own.
    DEFAULT_PERCENTILES = (10, 25, 50, 75, 90, 95, 99)

    # Fields: hash, author e-mail, author name, author date, unix timestamp.
    # git expands %x1f to the ASCII unit separator, which (unlike "|") cannot
    # realistically occur inside an author name or e-mail address.
    _LOG_FORMAT = "%H%x1f%ae%x1f%an%x1f%ai%x1f%at"
    _FIELD_SEPARATOR = "\x1f"

    def __init__(self, repo_path="."):
        """Remember the directory in which git commands will be executed."""
        self.repo_path = Path(repo_path)

    def is_git_repository(self):
        """Return True when git recognises ``repo_path`` as part of a repository.

        Asking git itself also accepts bare repositories and subdirectories
        of a work tree, which a plain ``.git`` existence check would reject.
        """
        try:
            probe = subprocess.run(
                ["git", "rev-parse", "--git-dir"],
                cwd=self.repo_path,
                capture_output=True,
                text=True,
            )
        except OSError:
            # Directory does not exist, or the git executable is missing.
            return False
        return probe.returncode == 0

    def run_git_command(self, cmd):
        """Execute a git command in the repository and return its output.

        Args:
            cmd: Preferably a list of arguments, executed without a shell.
                A plain string is still accepted and handed to the shell for
                backward compatibility; never build such a string from
                untrusted input.

        Returns:
            The command's stripped standard output, or None when the command
            exits non-zero or cannot be started.
        """
        try:
            result = subprocess.run(
                cmd,
                shell=isinstance(cmd, str),
                cwd=self.repo_path,
                capture_output=True,
                text=True,
                # Decode explicitly so non-ASCII author names do not depend
                # on the process locale.
                encoding="utf-8",
                errors="replace",
                check=True,
            )
        except subprocess.CalledProcessError as e:
            print(f"Git command failed: {e}", file=sys.stderr)
            if e.stderr:
                print(e.stderr.strip(), file=sys.stderr)
            return None
        except OSError as e:
            print(f"Git command failed: {e}", file=sys.stderr)
            return None
        return result.stdout.strip()

    def get_commit_data(self, since_date=None, until_date=None):
        """Extract commit data from ``git log --all``.

        Args:
            since_date: Optional lower bound passed to ``git log --since``.
            until_date: Optional upper bound passed to ``git log --until``.

        Returns:
            A list of dicts with the keys ``hash``, ``author_email``,
            ``author_name``, ``date`` (the ``%ai`` string) and ``timestamp``
            (int), in the order git printed them. Empty when git fails or
            reports no commits.
        """
        # An argument list keeps user-supplied dates away from any shell.
        cmd = ["git", "log", "--all", f"--format={self._LOG_FORMAT}"]
        if since_date:
            cmd.append(f"--since={since_date}")
        if until_date:
            cmd.append(f"--until={until_date}")

        output = self.run_git_command(cmd)
        if not output:
            return []

        commits = []
        for line in output.split('\n'):
            if not line:
                continue
            parts = line.split(self._FIELD_SEPARATOR)
            if len(parts) != 5:
                continue  # not a record produced by our format string
            try:
                timestamp = int(parts[4])
            except ValueError:
                continue
            commits.append({
                'hash': parts[0],
                'author_email': parts[1],
                'author_name': parts[2],
                'date': parts[3],
                'timestamp': timestamp,
            })
        return commits

    @staticmethod
    def _commit_day(commit):
        """Return the commit's author date as a datetime at midnight."""
        return datetime.fromisoformat(commit['date'].split()[0])

    def calculate_developer_metrics(self, commits):
        """Group commits by author e-mail.

        Args:
            commits: Commit dicts as produced by ``get_commit_data``, in any
                order.

        Returns:
            A mapping of author e-mail to a dict with ``commits`` (list),
            ``active_dates`` (set of ``date``), and ``first_commit`` /
            ``last_commit`` (earliest / latest commit day as ``datetime``).
        """
        developer_data = defaultdict(lambda: {
            'commits': [],
            'active_dates': set(),
            'first_commit': None,
            'last_commit': None,
        })
        for commit in commits:
            entry = developer_data[commit['author_email']]
            commit_date = self._commit_day(commit)
            entry['commits'].append(commit)
            entry['active_dates'].add(commit_date.date())
            # git log lists newest commits first, so compare explicitly
            # instead of relying on the iteration order.
            if entry['first_commit'] is None or commit_date < entry['first_commit']:
                entry['first_commit'] = commit_date
            if entry['last_commit'] is None or commit_date > entry['last_commit']:
                entry['last_commit'] = commit_date
        return developer_data

    def calculate_annual_metrics(self, developer_data):
        """Break each developer's activity down by calendar year.

        Args:
            developer_data: Mapping as returned by
                ``calculate_developer_metrics``.

        Returns:
            ``{author: {year: {'commits': int, 'active_days_count': int}}}``
            as plain dicts; authors without commits are omitted.
        """
        annual_metrics = {}
        for author, data in developer_data.items():
            per_year = {}
            for commit in data['commits']:
                commit_date = self._commit_day(commit)
                bucket = per_year.setdefault(
                    commit_date.year, {'commits': 0, 'active_days': set()}
                )
                bucket['commits'] += 1
                bucket['active_days'].add(commit_date.date())
            if per_year:
                # Replace each set of days with its size so the result is
                # JSON-serialisable.
                annual_metrics[author] = {
                    year: {
                        'commits': bucket['commits'],
                        'active_days_count': len(bucket['active_days']),
                    }
                    for year, bucket in per_year.items()
                }
        return annual_metrics

    def calculate_percentiles(self, values, percentiles=None):
        """Calculate percentiles for a list of values.

        The value reported for rank ``p`` is the element at index
        ``int(len(values) * p / 100)`` of the sorted values, clamped to the
        last element; no interpolation is performed.

        Args:
            values: Numbers to summarise.
            percentiles: Ranks to report; defaults to ``DEFAULT_PERCENTILES``.

        Returns:
            ``{'p<rank>': value}`` for every requested rank, or ``{}`` when
            ``values`` is empty.
        """
        if not values:
            return {}
        if percentiles is None:
            percentiles = self.DEFAULT_PERCENTILES
        sorted_values = sorted(values)
        last_index = len(sorted_values) - 1
        return {
            f'p{p}': sorted_values[min(int(len(sorted_values) * p / 100), last_index)]
            for p in percentiles
        }

    def analyze_repository(self, since_date=None, until_date=None):
        """Run the full analysis and return the aggregated results.

        Prints progress information to standard output.

        Args:
            since_date: Optional lower bound forwarded to ``get_commit_data``.
            until_date: Optional upper bound forwarded to ``get_commit_data``.

        Returns:
            A dict with ``summary``, the three percentile tables and
            per-developer metrics, or None when no commits were found.
        """
        print(f"Analyzing repository: {self.repo_path}")
        print("-" * 50)

        commits = self.get_commit_data(since_date, until_date)
        if not commits:
            print("No commits found in the repository.")
            return None
        print(f"Total commits analyzed: {len(commits)}")

        developer_data = self.calculate_developer_metrics(commits)
        print(f"Total unique contributors: {len(developer_data)}")

        annual_metrics = self.calculate_annual_metrics(developer_data)

        # One sample per developer ...
        all_commit_counts = [len(data['commits']) for data in developer_data.values()]
        all_active_days = [len(data['active_dates']) for data in developer_data.values()]
        # ... and one sample per (developer, year) pair.
        annual_active_days = [
            year_stats['active_days_count']
            for per_year in annual_metrics.values()
            for year_stats in per_year.values()
        ]

        return {
            'summary': {
                'total_commits': len(commits),
                'unique_contributors': len(developer_data),
                # Compared as ``%ai`` strings, i.e. by author-local time.
                'date_range': {
                    'first_commit': min(c['date'] for c in commits),
                    'last_commit': max(c['date'] for c in commits),
                },
            },
            'commit_count_percentiles': self.calculate_percentiles(all_commit_counts),
            'total_active_days_percentiles': self.calculate_percentiles(all_active_days),
            'annual_active_days_percentiles': self.calculate_percentiles(annual_active_days),
            'developer_metrics': {
                author: {
                    'total_commits': len(data['commits']),
                    'total_active_days': len(data['active_dates']),
                    'annual_breakdown': annual_metrics.get(author, {}),
                }
                for author, data in developer_data.items()
            },
        }

    def print_results(self, results, top=10):
        """Pretty print the analysis results.

        Args:
            results: Dict returned by ``analyze_repository``; nothing is
                printed when it is empty or None.
            top: Number of contributors to list, ranked by commit count.
        """
        if not results:
            return
        print("\n" + "=" * 60)
        print("GIT METRICS ANALYSIS RESULTS")
        print("=" * 60)

        summary = results['summary']
        date_range = summary['date_range']
        print("\nRepository Summary:")
        print(f" Total Commits: {summary['total_commits']:,}")
        print(f" Unique Contributors: {summary['unique_contributors']}")
        print(f" Date Range: {date_range['first_commit'][:10]} to {date_range['last_commit'][:10]}")

        print("\nCommit Count Percentiles (per developer):")
        for percentile, value in results['commit_count_percentiles'].items():
            print(f" {percentile}: {value} commits")

        print("\nTotal Active Days Percentiles (per developer):")
        for percentile, value in results['total_active_days_percentiles'].items():
            print(f" {percentile}: {value} days")

        print("\nAnnual Active Days Percentiles (developer-years):")
        for percentile, value in results['annual_active_days_percentiles'].items():
            print(f" {percentile}: {value} days/year")

        print(f"\nTop {top} Contributors by Commit Count:")
        sorted_devs = sorted(
            results['developer_metrics'].items(),
            key=lambda item: item[1]['total_commits'],
            reverse=True,
        )[:top]
        for i, (author, metrics) in enumerate(sorted_devs, 1):
            print(f" {i}. {author}: {metrics['total_commits']} commits, {metrics['total_active_days']} active days")

    def export_results(self, results, output_file):
        """Export results to a JSON file.

        Args:
            results: Dict returned by ``analyze_repository``.
            output_file: Destination path; an existing file is overwritten.
        """
        with open(output_file, 'w', encoding='utf-8') as f:
            # default=str covers any value json cannot encode natively.
            json.dump(results, f, indent=2, default=str)
        print(f"\nResults exported to: {output_file}")


def main():
    """Command-line entry point: parse arguments, analyse, print, export."""
    parser = argparse.ArgumentParser(
        description='Analyze git repository metrics similar to GitClear research'
    )
    parser.add_argument(
        'repo_path',
        nargs='?',
        default='.',
        help='Path to git repository (default: current directory)'
    )
    parser.add_argument(
        '--since',
        help='Analyze commits since this date (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--until',
        help='Analyze commits until this date (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--export',
        help='Export results to JSON file'
    )
    parser.add_argument(
        '--top',
        type=int,
        default=10,
        help='Number of top contributors to show (default: 10)'
    )
    args = parser.parse_args()
    if args.top < 1:
        parser.error('--top must be a positive integer')

    # Verify git repository
    analyzer = GitMetricsAnalyzer(args.repo_path)
    if not analyzer.is_git_repository():
        print(f"Error: {args.repo_path} is not a git repository", file=sys.stderr)
        sys.exit(1)

    # Run analysis
    results = analyzer.analyze_repository(args.since, args.until)

    # Display results, then export if requested
    if results:
        analyzer.print_results(results, top=args.top)
        if args.export:
            analyzer.export_results(results, args.export)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment