Ribeiro-Style Robustness Checks
# -*- coding: utf-8 -*-
"""sentiment_ribeiro_style_checklist.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1uyBg9-sidauCSSO6dcztl552HVSAgkUJ
"""

"""
Sentiment Model Robustness Testing Script
Implements Ribeiro-style tests for evaluating model robustness
"""

import os
import json
import random
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional, Callable
from dataclasses import dataclass
from collections import defaultdict
import warnings

warnings.filterwarnings('ignore')

# Core dependencies
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from datasets import load_dataset
from tqdm import tqdm
import openai
from scipy.stats import pearsonr
import re

# Configuration
@dataclass
class TestConfig:
    """Configuration for robustness tests"""
    use_openai: bool = False
    openai_api_key: Optional[str] = None
    max_samples_per_dataset: int = 500
    random_seed: int = 42
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

class PerturbationGenerator:
    """Generate various perturbations for robustness testing"""

    def __init__(self, config: TestConfig):
        self.config = config
        if config.use_openai and config.openai_api_key:
            openai.api_key = config.openai_api_key

        # Intensity modifiers
        self.intensifiers = {
            'positive': ['absolutely', 'extremely', 'incredibly', 'totally', 'really', 'very'],
            'negative': ['somewhat', 'slightly', 'a bit', 'kind of', 'sort of', 'fairly']
        }

        # Spurious words that shouldn't affect sentiment
        self.spurious_words = [
            'literally', 'basically', 'actually', 'obviously', 'frankly',
            'honestly', 'seriously', 'definitely', 'certainly', 'surely'
        ]

        # Negation words
        self.negation_words = ['not', "n't", 'never', 'no', 'none', 'neither', 'nor']

    def add_intensity_modifier(self, text: str, sentiment: str, intensity_type: str = 'positive') -> str:
        """Add intensity modifiers to text.

        `sentiment` is accepted for interface symmetry with the other
        perturbations but is not used by this heuristic.
        """
        words = text.split()
        # Find adjectives/adverbs (simplified suffix heuristic)
        adj_positions = []
        for i, word in enumerate(words):
            if any(word.endswith(suffix) for suffix in ['ful', 'less', 'ous', 'ive', 'able', 'ed']):
                adj_positions.append(i)
        if adj_positions:
            pos = random.choice(adj_positions)
            modifier = random.choice(self.intensifiers[intensity_type])
            words.insert(pos, modifier)
        return ' '.join(words)

    def add_spurious_words(self, text: str, num_words: int = 2) -> str:
        """Insert spurious words that shouldn't affect sentiment"""
        words = text.split()
        for _ in range(min(num_words, len(self.spurious_words))):
            spurious = random.choice(self.spurious_words)
            position = random.randint(0, len(words))
            words.insert(position, spurious)
        return ' '.join(words)

    def add_negation(self, text: str) -> str:
        """Add negation to change sentiment"""
        words = text.split()
        # Find a copular verb ("is", "was", ...) to negate
        neg_positions = []
        for i, word in enumerate(words):
            if i > 0 and word in ['is', 'was', 'are', 'were', 'am', 'be', 'been']:
                neg_positions.append(i)
        if neg_positions:
            pos = random.choice(neg_positions)
            words.insert(pos + 1, 'not')
        else:
            # Fallback: add "not" at the beginning
            words.insert(0, "Not")
        return ' '.join(words)

    def generate_counterfactual_openai(self, text: str, target_sentiment: str) -> Optional[str]:
        """Use OpenAI to generate a counterfactual with the opposite sentiment.

        Note: this uses the legacy `openai.ChatCompletion` API and therefore
        requires the pre-1.0 `openai` package.
        """
        if not self.config.use_openai:
            return None
        try:
            prompt = f"""
            Rewrite the following text to have {target_sentiment} sentiment while keeping the same topic and structure:
            Original: {text}
            Rewritten:
            """
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that rewrites text sentiment."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=150,
                temperature=0.7
            )
            return response.choices[0].message['content'].strip()
        except Exception as e:
            print(f"OpenAI API error: {e}")
            return None

    def swap_sentiment_words(self, text: str) -> str:
        """Simple word-level sentiment swapping"""
        positive_words = {
            'good': 'bad', 'great': 'terrible', 'excellent': 'awful',
            'love': 'hate', 'beautiful': 'ugly', 'amazing': 'horrible',
            'wonderful': 'dreadful', 'fantastic': 'disastrous', 'best': 'worst'
        }
        negative_words = {v: k for k, v in positive_words.items()}

        words = text.split()
        for i, word in enumerate(words):
            word_lower = word.lower().strip('.,!?')
            # Case-insensitive replacement so capitalized tokens (e.g., "Great") are also swapped
            if word_lower in positive_words:
                words[i] = re.sub(word_lower, positive_words[word_lower], word, flags=re.IGNORECASE)
            elif word_lower in negative_words:
                words[i] = re.sub(word_lower, negative_words[word_lower], word, flags=re.IGNORECASE)
        return ' '.join(words)

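
# --- Illustrative usage sketch (added for exposition; `_demo_perturbations` is
# not part of the original pipeline and is never called by it). It shows how the
# rule-based perturbations above transform a toy review. ---
def _demo_perturbations():
    """Apply each rule-based perturbation to a sample review and print the results."""
    gen = PerturbationGenerator(TestConfig(use_openai=False))
    sample = "The acting was wonderful and the plot is great."
    print("original:       ", sample)
    print("intensity (pos):", gen.add_intensity_modifier(sample, 'positive', 'positive'))
    print("spurious words: ", gen.add_spurious_words(sample))
    print("negation:       ", gen.add_negation(sample))
    print("sentiment swap: ", gen.swap_sentiment_words(sample))
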
class ModelEvaluator:
    """Evaluate models on original and perturbed data"""

    def __init__(self, models: Dict[str, pipeline], config: TestConfig):
        self.models = models
        self.config = config
        self.results = defaultdict(lambda: defaultdict(list))

    def predict_sentiment(self, model: pipeline, texts: List[str]) -> List[Dict]:
        """Get sentiment predictions from model"""
        predictions = []
        model_name = model.model.name_or_path if hasattr(model.model, 'name_or_path') else ""

        for text in tqdm(texts, desc="Predicting", leave=False):
            try:
                result = model(text[:512])  # Truncate long texts (by characters, as a rough guard on input length)

                # Handle different output formats
                if isinstance(result, list) and len(result) > 0:
                    result = result[0]

                # Special handling for the nlptown model (1-5 star ratings)
                if 'nlptown' in model_name:
                    if isinstance(result, dict):
                        label = result['label']
                        # Convert stars to sentiment: 1-2 stars = negative, 4-5 = positive, 3 = neutral
                        if '1' in label or '2' in label:
                            pos_score = 0.2
                            sentiment = 'negative'
                        elif '4' in label or '5' in label:
                            pos_score = 0.8
                            sentiment = 'positive'
                        else:  # 3 stars
                            pos_score = 0.5
                            sentiment = 'neutral'
                    else:
                        # Find the highest scoring label
                        max_score = 0
                        max_label = None
                        for item in result:
                            if item['score'] > max_score:
                                max_score = item['score']
                                max_label = item['label']
                        if '1' in max_label or '2' in max_label:
                            pos_score = 0.2
                            sentiment = 'negative'
                        elif '4' in max_label or '5' in max_label:
                            pos_score = 0.8
                            sentiment = 'positive'
                        else:
                            pos_score = 0.5
                            sentiment = 'neutral'
                else:
                    # Standard sentiment models. 'label_2' covers three-class models such as
                    # cardiffnlp/twitter-roberta-base-sentiment, where LABEL_2 is the positive
                    # class (its neutral LABEL_1 is still scored as positive by this heuristic).
                    pos_score = 0.0
                    if isinstance(result, dict):
                        if result['label'].lower() in ['positive', 'pos', '1', 'label_1', 'label_2']:
                            pos_score = result['score']
                        elif result['label'].lower() in ['negative', 'neg', '0', 'label_0']:
                            pos_score = 1 - result['score']
                    else:
                        for item in result:
                            if item['label'].lower() in ['positive', 'pos', '1', 'label_1', 'label_2']:
                                pos_score = item['score']
                                break
                            elif item['label'].lower() in ['negative', 'neg', '0', 'label_0']:
                                pos_score = 1 - item['score']
                                break
                    sentiment = 'positive' if pos_score > 0.5 else 'negative'

                predictions.append({
                    'text': text,
                    'score': pos_score,
                    'label': sentiment
                })
            except Exception as e:
                print(f"Prediction error: {e}")
                predictions.append({
                    'text': text,
                    'score': 0.5,
                    'label': 'neutral'
                })
        return predictions

    def evaluate_perturbation(self,
                              model_name: str,
                              model: pipeline,
                              original_texts: List[str],
                              perturbed_texts: List[str],
                              original_labels: List[int],
                              test_name: str) -> Dict:
        """Evaluate model on original vs perturbed texts"""
        # Get predictions
        orig_preds = self.predict_sentiment(model, original_texts)
        pert_preds = self.predict_sentiment(model, perturbed_texts)

        # Calculate metrics (gold labels: 1 = positive, 0 = negative)
        orig_accuracy = np.mean([
            (p['label'] == 'positive' and l == 1) or
            (p['label'] == 'negative' and l == 0)
            for p, l in zip(orig_preds, original_labels)
        ])
        pert_accuracy = np.mean([
            (p['label'] == 'positive' and l == 1) or
            (p['label'] == 'negative' and l == 0)
            for p, l in zip(pert_preds, original_labels)
        ])

        # Consistency: how often the model gives the same prediction
        consistency = np.mean([
            o['label'] == p['label']
            for o, p in zip(orig_preds, pert_preds)
        ])

        # Score correlation
        orig_scores = [p['score'] for p in orig_preds]
        pert_scores = [p['score'] for p in pert_preds]
        correlation, _ = pearsonr(orig_scores, pert_scores)

        # Average confidence change
        conf_change = np.mean([
            abs(o['score'] - p['score'])
            for o, p in zip(orig_preds, pert_preds)
        ])

        return {
            'model': model_name,
            'test': test_name,
            'orig_accuracy': orig_accuracy,
            'pert_accuracy': pert_accuracy,
            'consistency': consistency,
            'correlation': correlation,
            'avg_conf_change': conf_change,
            'accuracy_drop': orig_accuracy - pert_accuracy
        }

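
# --- Illustrative sketch (added for exposition; not part of the original script):
# the robustness metrics used above, recomputed by hand on two toy prediction
# lists so the definitions are concrete. ---
def _demo_metrics():
    """Recompute consistency, score correlation, and confidence change on toy predictions."""
    orig_preds = [
        {'score': 0.91, 'label': 'positive'},
        {'score': 0.15, 'label': 'negative'},
        {'score': 0.70, 'label': 'positive'},
    ]
    pert_preds = [
        {'score': 0.85, 'label': 'positive'},
        {'score': 0.40, 'label': 'negative'},
        {'score': 0.45, 'label': 'negative'},  # label flipped by the perturbation
    ]
    # Two of the three labels agree, so consistency is about 0.67
    consistency = np.mean([o['label'] == p['label'] for o, p in zip(orig_preds, pert_preds)])
    correlation, _ = pearsonr([o['score'] for o in orig_preds], [p['score'] for p in pert_preds])
    conf_change = np.mean([abs(o['score'] - p['score']) for o, p in zip(orig_preds, pert_preds)])
    print(f"consistency={consistency:.2f}, correlation={correlation:.2f}, avg_conf_change={conf_change:.2f}")
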
class RobustnessTestSuite:
    """Main test suite for robustness evaluation"""

    def __init__(self, config: TestConfig):
        self.config = config
        self.generator = PerturbationGenerator(config)
        self.results = []

    def load_models(self, model_names: List[str]) -> Dict[str, pipeline]:
        """Load sentiment analysis models"""
        models = {}
        for name in model_names:
            print(f"Loading model: {name}")
            try:
                models[name] = pipeline(
                    "sentiment-analysis",
                    model=name,
                    device=0 if self.config.device == "cuda" else -1
                )
            except Exception as e:
                print(f"Failed to load {name}: {e}")
        return models

    def load_datasets(self, dataset_names: List[str]) -> List[Tuple[List[str], List[int], str]]:
        """Load sentiment datasets"""
        datasets = []
        for name in dataset_names:
            print(f"Loading dataset: {name}")
            try:
                if name == "imdb":
                    ds = load_dataset("imdb", split="test")
                    texts = ds['text'][:self.config.max_samples_per_dataset]
                    labels = ds['label'][:self.config.max_samples_per_dataset]
                elif name == "sst2":
                    ds = load_dataset("glue", "sst2", split="validation")
                    texts = ds['sentence'][:self.config.max_samples_per_dataset]
                    labels = ds['label'][:self.config.max_samples_per_dataset]
                elif name == "amazon_polarity":
                    ds = load_dataset("amazon_polarity", split="test")
                    texts = [t[:1000] for t in ds['content'][:self.config.max_samples_per_dataset]]  # Truncate
                    labels = ds['label'][:self.config.max_samples_per_dataset]
                else:
                    continue
                datasets.append((texts, labels, name))
            except Exception as e:
                print(f"Failed to load {name}: {e}")
        return datasets

    def run_tests(self, models: Dict[str, pipeline], datasets: List[Tuple]) -> pd.DataFrame:
        """Run all robustness tests"""
        evaluator = ModelEvaluator(models, self.config)

        tests = [
            ("intensity_positive", lambda t, l: self.generator.add_intensity_modifier(t, l, 'positive')),
            ("intensity_negative", lambda t, l: self.generator.add_intensity_modifier(t, l, 'negative')),
            ("spurious_words", lambda t, l: self.generator.add_spurious_words(t)),
            ("negation", lambda t, l: self.generator.add_negation(t)),
            ("sentiment_swap", lambda t, l: self.generator.swap_sentiment_words(t))
        ]

        # Add OpenAI counterfactual if enabled
        if self.config.use_openai:
            tests.append(("openai_counterfactual",
                          lambda t, l: self.generator.generate_counterfactual_openai(
                              t, "negative" if l == 1 else "positive") or t))

        for dataset_texts, dataset_labels, dataset_name in datasets:
            print(f"\nTesting on {dataset_name}")
            for test_name, perturbation_fn in tests:
                print(f" Running {test_name} test...")

                # Generate perturbations
                perturbed_texts = []
                for text, label in zip(dataset_texts, dataset_labels):
                    try:
                        perturbed = perturbation_fn(text, label)
                        perturbed_texts.append(perturbed)
                    except Exception:
                        perturbed_texts.append(text)  # Fall back to the original text

                # Evaluate each model
                for model_name, model in models.items():
                    result = evaluator.evaluate_perturbation(
                        model_name, model,
                        dataset_texts, perturbed_texts,
                        dataset_labels,
                        f"{dataset_name}_{test_name}"
                    )
                    self.results.append(result)

        # Create results DataFrame
        df = pd.DataFrame(self.results)
        return df

    def create_summary_table(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create summary table of results"""
        # Pivot table for better visualization
        summary = df.pivot_table(
            index=['model', 'test'],
            values=['orig_accuracy', 'pert_accuracy', 'consistency', 'correlation', 'avg_conf_change'],
            aggfunc='mean'
        ).round(3)

        # Add robustness score (composite metric)
        # Handle NaN correlations by using 0.5 as a neutral value
        correlation_filled = summary['correlation'].fillna(0.5)
        summary['robustness_score'] = (
            summary['consistency'] * 0.3 +
            correlation_filled * 0.3 +
            (1 - summary['avg_conf_change']) * 0.2 +
            (1 - abs(summary['orig_accuracy'] - summary['pert_accuracy'])) * 0.2
        ).round(3)

        return summary

    def generate_detailed_report(self, df: pd.DataFrame, summary: pd.DataFrame) -> str:
        """Generate a detailed markdown report of the results"""
        report = []
        report.append("# Sentiment Model Robustness Analysis Report\n")
        report.append(f"**Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}\n")
        report.append(f"**Total Tests**: {len(df)}\n")
        report.append(f"**Models Tested**: {df['model'].nunique()}\n")
        report.append(f"**Datasets Used**: {len(set([t.split('_')[0] for t in df['test'].unique()]))}\n\n")

        # Executive summary
        report.append("## Executive Summary\n")

        # Best performing model
        best_model = summary.groupby(level=0)['robustness_score'].mean().idxmax()
        report.append(f"- **Most Robust Model**: {best_model}\n")

        # Most challenging perturbation
        worst_pert = summary.groupby(level=1)['consistency'].mean().idxmin()
        report.append(f"- **Most Challenging Perturbation**: {worst_pert}\n")

        # Critical vulnerabilities
        critical = df[df['consistency'] < 0.7]
        if not critical.empty:
            report.append(f"- **Critical Vulnerabilities Found**: {len(critical)} tests with <70% consistency\n")

        report.append("\n## Detailed Findings\n")

        # Per-model analysis
        for model in df['model'].unique():
            model_data = df[df['model'] == model]
            model_summary = summary.loc[model]

            report.append(f"\n### {model}\n")
            report.append(f"- **Average Original Accuracy**: {model_data['orig_accuracy'].mean():.3f}\n")
            report.append(f"- **Average Consistency**: {model_data['consistency'].mean():.3f}\n")
            report.append(f"- **Average Robustness Score**: {model_summary['robustness_score'].mean():.3f}\n")

            # Strengths and weaknesses
            best_test = model_data.loc[model_data['consistency'].idxmax(), 'test']
            worst_test = model_data.loc[model_data['consistency'].idxmin(), 'test']
            report.append(f"- **Strongest Against**: {best_test} ({model_data['consistency'].max():.3f} consistency)\n")
            report.append(f"- **Weakest Against**: {worst_test} ({model_data['consistency'].min():.3f} consistency)\n")

        # Perturbation analysis
        report.append("\n## Perturbation Impact Analysis\n")
        pert_types = set([t.split('_', 1)[1] for t in df['test'].unique()])
        for pert_type in sorted(pert_types):
            pert_data = df[df['test'].str.contains(pert_type)]
            report.append(f"\n### {pert_type.replace('_', ' ').title()}\n")
            report.append(f"- **Average Consistency**: {pert_data['consistency'].mean():.3f}\n")
            report.append(f"- **Average Accuracy Drop**: {pert_data['accuracy_drop'].mean():.3f}\n")
            report.append(f"- **Most Affected Model**: {pert_data.groupby('model')['consistency'].mean().idxmin()}\n")

        return ''.join(report)

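
# --- Illustrative sketch (added for exposition; not part of the original script):
# build a tiny results DataFrame by hand and run it through create_summary_table()
# to show how the composite robustness score is derived from the raw metrics. ---
def _demo_summary_table():
    """Summarize two fake test results without loading any models or datasets."""
    fake_results = pd.DataFrame([
        {'model': 'model_a', 'test': 'sst2_negation', 'orig_accuracy': 0.90,
         'pert_accuracy': 0.60, 'consistency': 0.65, 'correlation': 0.70,
         'avg_conf_change': 0.20, 'accuracy_drop': 0.30},
        {'model': 'model_a', 'test': 'sst2_spurious_words', 'orig_accuracy': 0.90,
         'pert_accuracy': 0.88, 'consistency': 0.97, 'correlation': 0.95,
         'avg_conf_change': 0.03, 'accuracy_drop': 0.02},
    ])
    suite = RobustnessTestSuite(TestConfig(use_openai=False))
    summary = suite.create_summary_table(fake_results)
    # robustness_score = 0.3*consistency + 0.3*correlation
    #                    + 0.2*(1 - avg_conf_change) + 0.2*(1 - |orig_acc - pert_acc|)
    print(summary[['consistency', 'correlation', 'robustness_score']])
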
def visualize_results(df: pd.DataFrame, summary: pd.DataFrame):
    """Create visualizations of robustness test results"""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        # Set style
        plt.style.use('seaborn-v0_8-darkgrid')
        sns.set_palette("husl")

        fig, axes = plt.subplots(2, 3, figsize=(18, 10))

        # 1. Consistency across perturbations
        consistency_data = df.pivot_table(
            index='model',
            columns='test',
            values='consistency'
        )
        sns.heatmap(consistency_data, annot=True, fmt='.2f', cmap='RdYlGn',
                    ax=axes[0, 0], vmin=0.5, vmax=1.0)
        axes[0, 0].set_title('Consistency Scores by Model and Test')
        axes[0, 0].set_xlabel('')

        # 2. Accuracy drop
        acc_drop_data = df.pivot_table(
            index='model',
            columns='test',
            values='accuracy_drop'
        )
        sns.heatmap(acc_drop_data, annot=True, fmt='.2f', cmap='RdYlGn_r',
                    ax=axes[0, 1], center=0)
        axes[0, 1].set_title('Accuracy Drop by Model and Test')
        axes[0, 1].set_xlabel('')

        # 3. Average confidence change
        conf_change_data = df.pivot_table(
            index='model',
            columns='test',
            values='avg_conf_change'
        )
        sns.heatmap(conf_change_data, annot=True, fmt='.3f', cmap='YlOrRd',
                    ax=axes[0, 2], vmin=0, vmax=0.4)
        axes[0, 2].set_title('Average Confidence Change')
        axes[0, 2].set_xlabel('')

        # 4. Model comparison bar plot
        model_avg = df.groupby('model').agg({
            'consistency': 'mean',
            'correlation': 'mean',
            'accuracy_drop': 'mean'
        }).round(3)
        model_avg.plot(kind='bar', ax=axes[1, 0])
        axes[1, 0].set_title('Average Metrics by Model')
        axes[1, 0].set_xlabel('Model')
        axes[1, 0].legend(loc='best')
        axes[1, 0].tick_params(axis='x', rotation=45)

        # 5. Perturbation impact
        test_impact = df.groupby('test').agg({
            'consistency': 'mean',
            'accuracy_drop': lambda x: abs(x).mean()
        }).round(3)
        test_impact.plot(kind='bar', ax=axes[1, 1])
        axes[1, 1].set_title('Average Impact by Perturbation Type')
        axes[1, 1].set_xlabel('Perturbation')
        axes[1, 1].legend(['Consistency', 'Abs Accuracy Drop'])
        axes[1, 1].tick_params(axis='x', rotation=45)

        # 6. Robustness scores (if available)
        if 'robustness_score' in summary.columns:
            robust_scores = summary['robustness_score'].dropna().reset_index()
            robust_pivot = robust_scores.pivot_table(
                index='model',
                columns='test',
                values='robustness_score'
            )
            sns.heatmap(robust_pivot, annot=True, fmt='.2f', cmap='RdYlGn',
                        ax=axes[1, 2], vmin=0.5, vmax=1.0)
            axes[1, 2].set_title('Robustness Scores')
            axes[1, 2].set_xlabel('')

        plt.tight_layout()
        plt.savefig('robustness_analysis.png', dpi=150, bbox_inches='tight')
        plt.show()
        print("\nVisualization saved to 'robustness_analysis.png'")
    except ImportError:
        print("\nMatplotlib/Seaborn not installed. Skipping visualization.")

def main():
    """Main execution function"""
    # Configuration
    config = TestConfig(
        use_openai=False,  # Set to True and add API key to use OpenAI
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        max_samples_per_dataset=200,  # Reduce for faster testing
        random_seed=42
    )

    # Set random seeds
    random.seed(config.random_seed)
    np.random.seed(config.random_seed)
    torch.manual_seed(config.random_seed)

    # Models to test
    model_names = [
        "distilbert-base-uncased-finetuned-sst-2-english",
        "nlptown/bert-base-multilingual-uncased-sentiment",
        "cardiffnlp/twitter-roberta-base-sentiment"
    ]

    # Datasets to use
    dataset_names = ["sst2", "imdb"]  # Add "amazon_polarity" for more data

    # Initialize test suite
    test_suite = RobustnessTestSuite(config)

    # Load models and datasets
    print("Loading models...")
    models = test_suite.load_models(model_names)

    print("\nLoading datasets...")
    datasets = test_suite.load_datasets(dataset_names)

    # Run tests
    print("\nRunning robustness tests...")
    results_df = test_suite.run_tests(models, datasets)

    # Create summary table
    print("\nGenerating summary table...")
    summary_table = test_suite.create_summary_table(results_df)

    # Display results
    print("\n" + "="*80)
    print("ROBUSTNESS TEST RESULTS")
    print("="*80)
    print("\nDetailed Results:")
    print(results_df.to_string())

    print("\n" + "="*80)
    print("SUMMARY TABLE")
    print("="*80)
    print(summary_table.to_string())

    # Save results
    results_df.to_csv("robustness_test_results.csv", index=False)
    summary_table.to_csv("robustness_summary.csv")
    print("\nResults saved to 'robustness_test_results.csv' and 'robustness_summary.csv'")

    # Generate detailed report
    detailed_report = test_suite.generate_detailed_report(results_df, summary_table)
    with open("robustness_report.md", "w") as f:
        f.write(detailed_report)
    print("Detailed report saved to 'robustness_report.md'")

    # Generate insights
    print("\n" + "="*80)
    print("KEY INSIGHTS")
    print("="*80)

    # Find most/least robust models
    model_scores = summary_table.groupby(level=0)['robustness_score'].mean().sort_values(ascending=False)
    print(f"\nMost robust model: {model_scores.index[0]} (score: {model_scores.iloc[0]:.3f})")
    print(f"Least robust model: {model_scores.index[-1]} (score: {model_scores.iloc[-1]:.3f})")

    # Find most challenging perturbations
    test_impact = summary_table.groupby(level=1)['consistency'].mean().sort_values()
    print(f"\nMost challenging perturbation: {test_impact.index[0]} (consistency: {test_impact.iloc[0]:.3f})")
    print(f"Least challenging perturbation: {test_impact.index[-1]} (consistency: {test_impact.iloc[-1]:.3f})")

    # Generate visualizations
    visualize_results(results_df, summary_table)

    # Actionable recommendations
    print("\n" + "="*80)
    print("RECOMMENDATIONS")
    print("="*80)

    # Analyze each model's weaknesses
    for model_name in models.keys():
        model_data = results_df[results_df['model'] == model_name]
        worst_test = model_data.loc[model_data['consistency'].idxmin(), 'test']
        worst_consistency = model_data['consistency'].min()

        print(f"\n{model_name}:")
        print(f" Weakest against: {worst_test} (consistency: {worst_consistency:.3f})")

        if worst_consistency < 0.7:
            print(" ⚠️ Critical vulnerability detected - consider:")
            if 'negation' in worst_test:
                print(" - Add negation-aware training examples")
                print(" - Implement linguistic preprocessing for negation handling")
            elif 'sentiment_swap' in worst_test:
                print(" - Augment training with antonym substitutions")
                print(" - Add lexicon-based sentiment verification")
        elif worst_consistency < 0.9:
            print(" ⚡ Moderate vulnerability - consider data augmentation")
        else:
            print(" ✅ Generally robust")

    print("\n" + "="*80)
    print("STATISTICAL SUMMARY")
    print("="*80)

    print("\nOverall Statistics:")
    print(f"Average consistency across all tests: {results_df['consistency'].mean():.3f}")
    print(f"Average accuracy drop: {results_df['accuracy_drop'].mean():.3f}")
    print(f"Average confidence change: {results_df['avg_conf_change'].mean():.3f}")
    print(f"Tests with >10% accuracy drop: {(abs(results_df['accuracy_drop']) > 0.1).sum()}/{len(results_df)}")

    # Dataset-specific insights
    print("\nDataset-specific performance:")
    for dataset in ['sst2', 'imdb']:
        dataset_data = results_df[results_df['test'].str.startswith(dataset)]
        print(f"\n{dataset.upper()}:")
        print(f" Avg consistency: {dataset_data['consistency'].mean():.3f}")
        print(f" Avg accuracy: {dataset_data['orig_accuracy'].mean():.3f}")
        print(f" Most robust model: {dataset_data.groupby('model')['consistency'].mean().idxmax()}")


if __name__ == "__main__":
    main()