Last active
January 29, 2026 06:29
-
-
Save papamoose/51c28536d30f8e4f2c249cba8bfb0749 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Slurm Job Timeline Visualizer | |
| Generates a Gantt chart from sacct output showing job execution and idle periods. | |
| """ | |
| import sys | |
| import argparse | |
| from datetime import datetime | |
| import matplotlib.pyplot as plt | |
| import matplotlib.patches as mpatches | |
| from matplotlib.dates import DateFormatter | |
| import pandas as pd | |
def parse_sacct_line(line):
    """Parse one pipe-delimited line of ``sacct -P`` output into a job record.

    The line must carry at least seven fields in this order:
    ``jobid|account|state|start|end|flags|nnodes``. Extra trailing fields
    are ignored.

    Args:
        line: A single raw line of sacct output (trailing newline allowed).

    Returns:
        A dict with the keys ``jobid``, ``state``, ``start``, ``end``,
        ``nnodes``, ``sched_method`` and ``account``; ``start``/``end`` are
        naive ``datetime`` objects and ``nnodes`` is an ``int``.
        ``None`` is returned when the line cannot be used: too few fields,
        a start/end of ``None``/``Unknown``, a blank state, a timestamp that
        is not ``YYYY-MM-DDTHH:MM:SS``, or a non-integer node count.
    """
    parts = line.strip().split('|')
    if len(parts) < 7:
        return None

    jobid, account, state, start, end, flags, nnodes = parts[:7]

    # Skip jobs without a usable start/end time.
    if start in ('None', 'Unknown') or end in ('None', 'Unknown'):
        return None

    # Keep only the first word: "CANCELLED by 5000" -> "CANCELLED".
    # A blank state field has no first word; treat the line as malformed
    # instead of letting an IndexError abort the whole run.
    state_words = state.split()
    if not state_words:
        return None

    # First matching scheduler flag wins, in this fixed priority order.
    sched_method = 'Unknown'
    for candidate in ('SchedMain', 'SchedBackfill', 'SchedSubmit'):
        if candidate in flags:
            sched_method = candidate
            break

    timestamp_format = '%Y-%m-%dT%H:%M:%S'
    try:
        start_dt = datetime.strptime(start, timestamp_format)
        end_dt = datetime.strptime(end, timestamp_format)
        node_count = int(nnodes)
    except (ValueError, TypeError):
        return None

    return {
        'jobid': jobid,
        'state': state_words[0],
        'start': start_dt,
        'end': end_dt,
        'nnodes': node_count,
        'sched_method': sched_method,
        'account': account,
    }
def read_sacct_data(input_file=None):
    """Load sacct job records from a file, or from stdin when no file is given.

    Every line is handed to ``parse_sacct_line``; lines it rejects are
    dropped. The surviving job dicts are returned as a list ordered by
    their ``start`` time.
    """
    if input_file:
        with open(input_file, 'r') as handle:
            raw_lines = handle.readlines()
    else:
        raw_lines = sys.stdin.readlines()

    parsed = (parse_sacct_line(raw) for raw in raw_lines)
    records = [record for record in parsed if record]
    records.sort(key=lambda record: record['start'])
    return records
def find_gaps(jobs):
    """Return the idle intervals during which no job in ``jobs`` is running.

    Jobs may overlap. A gap is only reported when the next job starts after
    *every* earlier job has finished, so a short job ending inside a longer
    one does not produce a false idle period.

    Args:
        jobs: Iterable of dicts with comparable ``start`` and ``end`` values.
            Need not be pre-sorted; it is ordered by ``start`` here.

    Returns:
        A list of ``{'start': ..., 'end': ...}`` dicts in chronological
        order; empty when there are fewer than two jobs or no idle time.
    """
    gaps = []
    busy_until = None  # latest end time among all jobs seen so far
    for job in sorted(jobs, key=lambda item: item['start']):
        if busy_until is not None and job['start'] > busy_until:
            gaps.append({'start': busy_until, 'end': job['start']})
        if busy_until is None or job['end'] > busy_until:
            busy_until = job['end']
    return gaps
def create_gantt_chart(jobs, gaps, output_file='slurm_timeline.svg', figsize=(14, 8),
                       color_by='scheduler', show_idle_labels=True):
    """Render the jobs as a Gantt chart, save it as SVG and print statistics.

    Args:
        jobs: List of job dicts with the keys ``start``, ``end``, ``nnodes``,
            ``state``, ``sched_method`` and ``account``.
        gaps: List of ``{'start', 'end'}`` idle-period dicts; used only for
            the idle-time statistics, not for drawing.
        output_file: Path the SVG is written to.
        figsize: ``(width, height)`` in inches. The height is a minimum; it
            grows to 0.8 inch per row when there are many rows.
        color_by: ``'account'`` colors jobs per account; any other value
            colors them by scheduling method.
        show_idle_labels: Whether to write "IDLE" on idle spans longer than
            0.2 hours (12 minutes).

    Returns:
        None. Prints a message and returns early when ``jobs`` is empty.
    """
    if not jobs:
        print("No valid jobs to visualize.")
        return

    seconds_per_hour = 3600.0
    # Matplotlib date axes count in days, so bar widths must be in days too.
    seconds_per_day = 86400.0

    # The job records only carry a node *count*, so rows are synthetic:
    # a job with nnodes == k is drawn on rows 0..k-1. Always keep one row.
    num_nodes = max(max(job['nnodes'] for job in jobs), 1)
    node_list = list(range(num_nodes))

    # Honor the requested height, growing it when there are many rows.
    fig_height = max(figsize[1], num_nodes * 0.8)
    fig, ax = plt.subplots(figsize=(figsize[0], fig_height))

    color_for_job, legend_title, legend_items = _job_color_scheme(jobs, color_by)

    # Failed and cancelled jobs get a thick colored outline.
    edge_styles = {
        'FAILED': ('#c0392b', 2),      # Dark red
        'CANCELLED': ('#d68910', 2),   # Dark orange
    }

    # Draw the jobs and remember which jobs landed on which row.
    node_jobs = {node: [] for node in node_list}
    for job in jobs:
        width_days = (job['end'] - job['start']).total_seconds() / seconds_per_day
        edge_color, line_width = edge_styles.get(job['state'], ('black', 0.5))
        nodes_used = min(job['nnodes'], num_nodes) if job['nnodes'] > 0 else 1
        for node in node_list[:nodes_used]:
            node_jobs[node].append(job)
            ax.barh(node, width_days, left=job['start'], height=0.8,
                    color=color_for_job(job), edgecolor=edge_color,
                    linewidth=line_width)

    # Draw the idle spans of each row.
    for node in node_list:
        for idle_start, idle_end in _idle_intervals(node_jobs[node]):
            idle_span = idle_end - idle_start
            idle_seconds = idle_span.total_seconds()
            ax.barh(node, idle_seconds / seconds_per_day, left=idle_start, height=0.8,
                    color='lightgray', edgecolor='gray', linewidth=0.5,
                    linestyle='--', alpha=0.3)
            # Only label gaps longer than 12 minutes.
            if show_idle_labels and idle_seconds / seconds_per_hour > 0.2:
                ax.text(idle_start + idle_span / 2, node, 'IDLE',
                        ha='center', va='center', fontsize=6,
                        style='italic', color='gray', alpha=0.7)

    # Format the plot
    ax.set_yticks(node_list)
    ax.set_yticklabels([f'Node {node}' for node in node_list])
    ax.set_ylabel('Nodes', fontsize=12)
    ax.set_xlabel('Time', fontsize=12)
    ax.set_title('Slurm Job Timeline by Node', fontsize=14, weight='bold')

    # Scope the x-axis to the earliest start and latest end time.
    earliest_start = min(job['start'] for job in jobs)
    latest_end = max(job['end'] for job in jobs)
    ax.set_xlim(earliest_start, latest_end)
    ax.xaxis.set_major_formatter(DateFormatter('%Y-%m-%d\n%H:%M'))
    plt.xticks(rotation=0, ha='center')

    # Legend: color coding first, then the idle/failed/cancelled indicators.
    legend_elements = [mpatches.Patch(color=color, label=label)
                       for label, color in legend_items]
    legend_elements.extend([
        mpatches.Patch(facecolor='lightgray', edgecolor='gray',
                       linestyle='--', label='Node Idle', alpha=0.3),
        mpatches.Patch(facecolor='white', edgecolor='#c0392b',
                       linewidth=2, label='Failed Job'),
        mpatches.Patch(facecolor='white', edgecolor='#d68910',
                       linewidth=2, label='Cancelled Job'),
    ])
    ax.legend(handles=legend_elements, loc='upper right', title=legend_title)

    ax.grid(axis='x', alpha=0.3, linestyle=':')
    ax.grid(axis='y', alpha=0.2, linestyle=':')

    # Statistics, all in hours. The span runs to the latest *end*, which is
    # not necessarily the end of the last-started job.
    total_time = (latest_end - earliest_start).total_seconds() / seconds_per_hour
    job_time = sum((j['end'] - j['start']).total_seconds() for j in jobs) / seconds_per_hour
    idle_time = sum((g['end'] - g['start']).total_seconds() for g in gaps) / seconds_per_hour
    # NOTE(review): job_time sums every job's duration, so with concurrent
    # jobs this ratio can exceed 100% - confirm the intended definition.
    utilization = (job_time / total_time * 100) if total_time > 0 else 0

    stats_text = f"Total Jobs: {len(jobs)} | Nodes: {num_nodes} | Timeline: {total_time:.1f}h | " \
                 f"Active: {job_time:.1f}h | Idle: {idle_time:.1f}h | " \
                 f"Utilization: {utilization:.1f}%"
    fig.text(0.5, 0.02, stats_text, ha='center', fontsize=10,
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    fig.tight_layout(rect=[0, 0.04, 1, 1])
    try:
        # The output is always SVG, whatever extension output_file carries.
        fig.savefig(output_file, format='svg', bbox_inches='tight')
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    print(f"Gantt chart saved to: {output_file}")

    _print_timeline_statistics(jobs, gaps, num_nodes, color_by,
                               total_time, job_time, idle_time, utilization)


def _job_color_scheme(jobs, color_by):
    """Return ``(color_for_job, legend_title, legend_items)`` for the chart.

    ``color_for_job`` maps a job dict to a hex color string and
    ``legend_items`` is a list of ``(label, color)`` pairs.
    """
    if color_by == 'account':
        import matplotlib.colors as mcolors

        unique_accounts = sorted({job['account'] for job in jobs})
        # tab20 offers up to 20 distinct colors; hsv has more but less distinct.
        cmap_name = 'tab20' if len(unique_accounts) <= 20 else 'hsv'
        try:
            cmap = plt.colormaps[cmap_name]
        except (KeyError, AttributeError):
            # Fallback for older matplotlib versions
            import matplotlib.cm as cm
            cmap = cm.get_cmap(cmap_name)

        step = max(len(unique_accounts), 1)
        color_map = {
            account: mcolors.rgb2hex(cmap(index / step))
            for index, account in enumerate(unique_accounts)
        }

        def color_for_account(job):
            return color_map.get(job['account'], '#95a5a6')

        legend_items = [(account, color_map[account]) for account in unique_accounts]
        return color_for_account, 'Accounts', legend_items

    sched_colors = {
        'SchedMain': '#3498db',      # Blue
        'SchedSubmit': '#2ecc71',    # Green
        'SchedBackfill': '#e74c3c',  # Red
        'Unknown': '#95a5a6',        # Gray
    }

    def color_for_method(job):
        return sched_colors.get(job['sched_method'], sched_colors['Unknown'])

    # Only list the methods that actually occur in the data.
    used_methods = {job['sched_method'] for job in jobs}
    legend_items = [(method, color) for method, color in sched_colors.items()
                    if method in used_methods]
    return color_for_method, 'Scheduling Methods', legend_items


def _idle_intervals(row_jobs):
    """Yield ``(start, end)`` spans in which none of ``row_jobs`` is running."""
    busy_until = None  # latest end time among the jobs seen so far
    for job in sorted(row_jobs, key=lambda item: item['start']):
        if busy_until is not None and job['start'] > busy_until:
            yield busy_until, job['start']
        if busy_until is None or job['end'] > busy_until:
            busy_until = job['end']


def _print_timeline_statistics(jobs, gaps, num_nodes, color_by,
                               total_time, job_time, idle_time, utilization):
    """Print the job counts and timing summary (times in hours) to stdout."""
    rule = '=' * 60

    def count_state(state):
        return sum(1 for job in jobs if job['state'] == state)

    print(f"\n{rule}")
    print("Timeline Statistics:")
    print(rule)
    print(f"Total jobs: {len(jobs)}")
    print(f"Total nodes: {num_nodes}")
    print(f"Completed: {count_state('COMPLETED')}")
    print(f"Failed: {count_state('FAILED')}")
    print(f"Cancelled: {count_state('CANCELLED')}")

    # Break the job count down by whichever attribute colors the chart.
    if color_by == 'account':
        heading, key = "\nAccounts:", 'account'
    else:
        heading, key = "\nScheduling Methods:", 'sched_method'
    counts = {}
    for job in jobs:
        counts[job[key]] = counts.get(job[key], 0) + 1
    print(heading)
    for label, count in sorted(counts.items()):
        print(f" {label:20s} {count}")

    print(f"\nTimeline span: {total_time:.2f} hours")
    print(f"Active time: {job_time:.2f} hours")
    print(f"Idle time: {idle_time:.2f} hours")
    print(f"Utilization: {utilization:.2f}%")
    print(f"Number of gaps: {len(gaps)}")
    print(f"{rule}\n")
def main(argv=None):
    """Command-line entry point: parse options, load sacct data, draw the chart.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` makes argparse read ``sys.argv[1:]``, which is
            the behavior of calling ``main()`` with no arguments.

    Exits with status 1 when the input cannot be read or holds no valid
    jobs; both diagnostics are written to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Visualize Slurm job timeline from sacct output',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
# Read from file
%(prog)s -i sacct_output.txt -o timeline.svg
# Pipe from sacct command (note: account field is REQUIRED)
sacct -a -X --starttime=2020-01-01 -P -n -o jobid,account,state,start,end,flags,nnodes | %(prog)s
# Color by account instead of scheduler
sacct -a -X --starttime=2020-01-01 -P -n -o jobid,account,state,start,end,flags,nnodes | %(prog)s --color-by account
# Custom figure size without idle labels
%(prog)s -i sacct_output.txt --width 20 --height 10 --no-idle-labels
'''
    )
    parser.add_argument('-i', '--input', help='Input file with sacct output (uses stdin if not specified)')
    parser.add_argument('-o', '--output', default='slurm_timeline.svg',
                        help='Output image file (default: slurm_timeline.svg)')
    parser.add_argument('--width', type=float, default=14,
                        help='Figure width in inches (default: 14)')
    parser.add_argument('--height', type=float, default=8,
                        help='Figure height in inches (default: 8)')
    parser.add_argument('--color-by', choices=['scheduler', 'account'], default='scheduler',
                        help='Color jobs by scheduler method or account (default: scheduler)')
    parser.add_argument('--no-idle-labels', action='store_true',
                        help='Hide "IDLE" labels on gaps between jobs')
    args = parser.parse_args(argv)

    # Read and process data
    print("Reading sacct data...")
    try:
        jobs = read_sacct_data(args.input)
    except OSError as exc:
        # Missing or unreadable input file: report it instead of a traceback.
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit(f"Cannot read sacct input: {exc}")

    if not jobs:
        print("No valid jobs found in input.", file=sys.stderr)
        sys.exit(1)
    print(f"Found {len(jobs)} valid jobs.")

    # Find gaps
    gaps = find_gaps(jobs)
    print(f"Found {len(gaps)} idle periods.")

    # Create visualization
    create_gantt_chart(jobs, gaps, args.output, figsize=(args.width, args.height),
                       color_by=args.color_by, show_idle_labels=not args.no_idle_labels)
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment