@soodoku
Last active August 20, 2025 23:37
Ribeiro Style Robustness Checks
# -*- coding: utf-8 -*-
"""sentiment_ribeiro_style_checklist.ipynb

Sentiment Model Robustness Testing Script
Implements Ribeiro-style tests for evaluating model robustness.

Automatically generated by Colab. The original file is located at
https://colab.research.google.com/drive/1uyBg9-sidauCSSO6dcztl552HVSAgkUJ
"""
import os
import json
import random
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional, Callable
from dataclasses import dataclass
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
# Core dependencies
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from datasets import load_dataset
from tqdm import tqdm
import openai
from scipy.stats import pearsonr
import re
# Configuration
@dataclass
class TestConfig:
"""Configuration for robustness tests"""
use_openai: bool = False
openai_api_key: Optional[str] = None
max_samples_per_dataset: int = 500
random_seed: int = 42
device: str = "cuda" if torch.cuda.is_available() else "cpu"

class PerturbationGenerator:
    """Generate various perturbations for robustness testing"""

    def __init__(self, config: TestConfig):
        self.config = config
        if config.use_openai and config.openai_api_key:
            openai.api_key = config.openai_api_key
        # Intensity modifiers
        self.intensifiers = {
            'positive': ['absolutely', 'extremely', 'incredibly', 'totally', 'really', 'very'],
            'negative': ['somewhat', 'slightly', 'a bit', 'kind of', 'sort of', 'fairly']
        }
        # Spurious words that shouldn't affect sentiment
        self.spurious_words = [
            'literally', 'basically', 'actually', 'obviously', 'frankly',
            'honestly', 'seriously', 'definitely', 'certainly', 'surely'
        ]
        # Negation words
        self.negation_words = ['not', "n't", 'never', 'no', 'none', 'neither', 'nor']

    def add_intensity_modifier(self, text: str, sentiment: str, intensity_type: str = 'positive') -> str:
        """Add intensity modifiers to text"""
        words = text.split()
        # Find adjectives/adverbs (simplified heuristic)
        adj_positions = []
        for i, word in enumerate(words):
            if any(word.endswith(suffix) for suffix in ['ful', 'less', 'ous', 'ive', 'able', 'ed']):
                adj_positions.append(i)
        if adj_positions:
            pos = random.choice(adj_positions)
            modifier = random.choice(self.intensifiers[intensity_type])
            words.insert(pos, modifier)
        return ' '.join(words)

    def add_spurious_words(self, text: str, num_words: int = 2) -> str:
        """Insert spurious words that shouldn't affect sentiment"""
        words = text.split()
        for _ in range(min(num_words, len(self.spurious_words))):
            spurious = random.choice(self.spurious_words)
            position = random.randint(0, len(words))
            words.insert(position, spurious)
        return ' '.join(words)

    def add_negation(self, text: str) -> str:
        """Add negation to change sentiment"""
        words = text.split()
        # Find verbs or adjectives to negate
        neg_positions = []
        for i, word in enumerate(words):
            if i > 0 and word in ['is', 'was', 'are', 'were', 'am', 'be', 'been']:
                neg_positions.append(i)
        if neg_positions:
            pos = random.choice(neg_positions)
            words.insert(pos + 1, 'not')
        else:
            # Fallback: add "not" at the beginning
            words.insert(0, "Not")
        return ' '.join(words)
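
    # Note: the helper below targets the legacy openai<1.0 Python SDK
    # (openai.api_key + openai.ChatCompletion.create). With openai>=1.0 the
    # equivalent call is openai.OpenAI(api_key=...).chat.completions.create(...);
    # adjust accordingly if you run this with a newer client.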
    def generate_counterfactual_openai(self, text: str, target_sentiment: str) -> Optional[str]:
        """Use OpenAI to generate counterfactual with opposite sentiment"""
        if not self.config.use_openai:
            return None
        try:
            prompt = f"""
Rewrite the following text to have {target_sentiment} sentiment while keeping the same topic and structure:
Original: {text}
Rewritten:
"""
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that rewrites text sentiment."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=150,
                temperature=0.7
            )
            return response.choices[0].message['content'].strip()
        except Exception as e:
            print(f"OpenAI API error: {e}")
            return None

    def swap_sentiment_words(self, text: str) -> str:
        """Simple word-level sentiment swapping"""
        positive_words = {
            'good': 'bad', 'great': 'terrible', 'excellent': 'awful',
            'love': 'hate', 'beautiful': 'ugly', 'amazing': 'horrible',
            'wonderful': 'dreadful', 'fantastic': 'disastrous', 'best': 'worst'
        }
        negative_words = {v: k for k, v in positive_words.items()}
        words = text.split()
        for i, word in enumerate(words):
            word_lower = word.lower().strip('.,!?')
            if word_lower in positive_words:
                words[i] = word.replace(word_lower, positive_words[word_lower])
            elif word_lower in negative_words:
                words[i] = word.replace(word_lower, negative_words[word_lower])
        return ' '.join(words)
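
# Illustrative behaviour of the generators above (spurious-word placement is random):
#   gen = PerturbationGenerator(TestConfig())
#   gen.swap_sentiment_words("The movie was great and the cast was wonderful")
#       -> "The movie was terrible and the cast was dreadful"
#   gen.add_negation("The plot is clever")
#       -> "The plot is not clever"
#   gen.add_spurious_words("I loved this film")
#       -> e.g. "I honestly loved this basically film"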

class ModelEvaluator:
    """Evaluate models on original and perturbed data"""

    def __init__(self, models: Dict[str, pipeline], config: TestConfig):
        self.models = models
        self.config = config
        self.results = defaultdict(lambda: defaultdict(list))

    def predict_sentiment(self, model: pipeline, texts: List[str]) -> List[Dict]:
        """Get sentiment predictions from model"""
        predictions = []
        model_name = model.model.name_or_path if hasattr(model.model, 'name_or_path') else ""
        for text in tqdm(texts, desc="Predicting", leave=False):
            try:
                result = model(text[:512])  # Truncate long texts
                # Handle different output formats
                if isinstance(result, list) and len(result) > 0:
                    result = result[0]
                # Special handling for nlptown model (5-star ratings)
                if 'nlptown' in model_name:
                    # This model outputs 1-5 star ratings
                    if isinstance(result, dict):
                        label = result['label']
                        # Convert stars to sentiment: 1-2 stars = negative, 4-5 = positive, 3 = neutral
                        if '1' in label or '2' in label:
                            pos_score = 0.2
                            sentiment = 'negative'
                        elif '4' in label or '5' in label:
                            pos_score = 0.8
                            sentiment = 'positive'
                        else:  # 3 stars
                            pos_score = 0.5
                            sentiment = 'neutral'
                    else:
                        # Find the highest scoring label
                        max_score = 0
                        max_label = None
                        for item in result:
                            if item['score'] > max_score:
                                max_score = item['score']
                                max_label = item['label']
                        if '1' in max_label or '2' in max_label:
                            pos_score = 0.2
                            sentiment = 'negative'
                        elif '4' in max_label or '5' in max_label:
                            pos_score = 0.8
                            sentiment = 'positive'
                        else:
                            pos_score = 0.5
                            sentiment = 'neutral'
                else:
                    # Standard sentiment models
                    pos_score = 0.0
                    if isinstance(result, dict):
                        if result['label'].lower() in ['positive', 'pos', '1', 'label_1']:
                            pos_score = result['score']
                        elif result['label'].lower() in ['negative', 'neg', '0', 'label_0']:
                            pos_score = 1 - result['score']
                    else:
                        for item in result:
                            if item['label'].lower() in ['positive', 'pos', '1', 'label_1']:
                                pos_score = item['score']
                                break
                            elif item['label'].lower() in ['negative', 'neg', '0', 'label_0']:
                                pos_score = 1 - item['score']
                                break
                    sentiment = 'positive' if pos_score > 0.5 else 'negative'
                predictions.append({
                    'text': text,
                    'score': pos_score,
                    'label': sentiment
                })
            except Exception as e:
                print(f"Prediction error: {e}")
                predictions.append({
                    'text': text,
                    'score': 0.5,
                    'label': 'neutral'
                })
        return predictions
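
    # Caveat on the generic label mapping above: it assumes binary models whose labels
    # look like POSITIVE/NEGATIVE or LABEL_0/LABEL_1. Three-class models such as
    # cardiffnlp/twitter-roberta-base-sentiment report LABEL_0/LABEL_1/LABEL_2
    # (negative/neutral/positive), so their positive class falls through to the
    # 0.0 default; extend the label lists if that model needs exact handling.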
    def evaluate_perturbation(self,
                              model_name: str,
                              model: pipeline,
                              original_texts: List[str],
                              perturbed_texts: List[str],
                              original_labels: List[int],
                              test_name: str) -> Dict:
        """Evaluate model on original vs perturbed texts"""
        # Get predictions
        orig_preds = self.predict_sentiment(model, original_texts)
        pert_preds = self.predict_sentiment(model, perturbed_texts)
        # Calculate metrics
        orig_accuracy = np.mean([
            (p['label'] == 'positive' and l == 1) or
            (p['label'] == 'negative' and l == 0)
            for p, l in zip(orig_preds, original_labels)
        ])
        pert_accuracy = np.mean([
            (p['label'] == 'positive' and l == 1) or
            (p['label'] == 'negative' and l == 0)
            for p, l in zip(pert_preds, original_labels)
        ])
        # Consistency: how often model gives same prediction
        consistency = np.mean([
            o['label'] == p['label']
            for o, p in zip(orig_preds, pert_preds)
        ])
        # Score correlation
        orig_scores = [p['score'] for p in orig_preds]
        pert_scores = [p['score'] for p in pert_preds]
        correlation, _ = pearsonr(orig_scores, pert_scores)
        # Average confidence change
        conf_change = np.mean([
            abs(o['score'] - p['score'])
            for o, p in zip(orig_preds, pert_preds)
        ])
        return {
            'model': model_name,
            'test': test_name,
            'orig_accuracy': orig_accuracy,
            'pert_accuracy': pert_accuracy,
            'consistency': consistency,
            'correlation': correlation,
            'avg_conf_change': conf_change,
            'accuracy_drop': orig_accuracy - pert_accuracy
        }
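
# How to read the metrics above, in the Ribeiro-style CheckList spirit:
# - For label-preserving perturbations (spurious_words, intensity_positive/negative),
#   high consistency and a small accuracy_drop indicate robustness (an invariance-style check).
# - For label-flipping perturbations (sentiment_swap, negation, openai_counterfactual),
#   a model that tracks the edit should change its prediction, so low consistency on
#   those tests is expected behaviour rather than a failure (a directional-style check).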

class RobustnessTestSuite:
    """Main test suite for robustness evaluation"""

    def __init__(self, config: TestConfig):
        self.config = config
        self.generator = PerturbationGenerator(config)
        self.results = []

    def load_models(self, model_names: List[str]) -> Dict[str, pipeline]:
        """Load sentiment analysis models"""
        models = {}
        for name in model_names:
            print(f"Loading model: {name}")
            try:
                models[name] = pipeline(
                    "sentiment-analysis",
                    model=name,
                    device=0 if self.config.device == "cuda" else -1
                )
            except Exception as e:
                print(f"Failed to load {name}: {e}")
        return models

    def load_datasets(self, dataset_names: List[str]) -> List[Tuple[List[str], List[int], str]]:
        """Load sentiment datasets"""
        datasets = []
        for name in dataset_names:
            print(f"Loading dataset: {name}")
            try:
                if name == "imdb":
                    ds = load_dataset("imdb", split="test")
                    texts = ds['text'][:self.config.max_samples_per_dataset]
                    labels = ds['label'][:self.config.max_samples_per_dataset]
                elif name == "sst2":
                    ds = load_dataset("glue", "sst2", split="validation")
                    texts = ds['sentence'][:self.config.max_samples_per_dataset]
                    labels = ds['label'][:self.config.max_samples_per_dataset]
                elif name == "amazon_polarity":
                    ds = load_dataset("amazon_polarity", split="test")
                    texts = [t[:1000] for t in ds['content'][:self.config.max_samples_per_dataset]]  # Truncate
                    labels = ds['label'][:self.config.max_samples_per_dataset]
                else:
                    continue
                datasets.append((texts, labels, name))
            except Exception as e:
                print(f"Failed to load {name}: {e}")
        return datasets

    def run_tests(self, models: Dict[str, pipeline], datasets: List[Tuple]) -> pd.DataFrame:
        """Run all robustness tests"""
        evaluator = ModelEvaluator(models, self.config)
        tests = [
            ("intensity_positive", lambda t, l: self.generator.add_intensity_modifier(t, l, 'positive')),
            ("intensity_negative", lambda t, l: self.generator.add_intensity_modifier(t, l, 'negative')),
            ("spurious_words", lambda t, l: self.generator.add_spurious_words(t)),
            ("negation", lambda t, l: self.generator.add_negation(t)),
            ("sentiment_swap", lambda t, l: self.generator.swap_sentiment_words(t))
        ]
        # Add OpenAI counterfactual if enabled
        if self.config.use_openai:
            tests.append(("openai_counterfactual",
                          lambda t, l: self.generator.generate_counterfactual_openai(
                              t, "negative" if l == 1 else "positive") or t))
        for dataset_texts, dataset_labels, dataset_name in datasets:
            print(f"\nTesting on {dataset_name}")
            for test_name, perturbation_fn in tests:
                print(f" Running {test_name} test...")
                # Generate perturbations
                perturbed_texts = []
                for text, label in zip(dataset_texts, dataset_labels):
                    try:
                        perturbed = perturbation_fn(text, label)
                        perturbed_texts.append(perturbed)
                    except Exception:
                        perturbed_texts.append(text)  # Fall back to the original text
                # Evaluate each model
                for model_name, model in models.items():
                    result = evaluator.evaluate_perturbation(
                        model_name, model,
                        dataset_texts, perturbed_texts,
                        dataset_labels,
                        f"{dataset_name}_{test_name}"
                    )
                    self.results.append(result)
        # Create results DataFrame
        df = pd.DataFrame(self.results)
        return df

    def create_summary_table(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create summary table of results"""
        # Pivot table for better visualization
        summary = df.pivot_table(
            index=['model', 'test'],
            values=['orig_accuracy', 'pert_accuracy', 'consistency', 'correlation', 'avg_conf_change'],
            aggfunc='mean'
        ).round(3)
        # Add robustness score (composite metric)
        # Handle NaN correlations by using 0.5 as neutral value
        correlation_filled = summary['correlation'].fillna(0.5)
        summary['robustness_score'] = (
            summary['consistency'] * 0.3 +
            correlation_filled * 0.3 +
            (1 - summary['avg_conf_change']) * 0.2 +
            (1 - abs(summary['orig_accuracy'] - summary['pert_accuracy'])) * 0.2
        ).round(3)
        return summary
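
    # The composite robustness_score above is simply a weighted sum (weights chosen to total 1.0):
    #   robustness_score = 0.3 * consistency
    #                    + 0.3 * correlation            (NaN correlations treated as 0.5)
    #                    + 0.2 * (1 - avg_conf_change)
    #                    + 0.2 * (1 - |orig_accuracy - pert_accuracy|)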
    def generate_detailed_report(self, df: pd.DataFrame, summary: pd.DataFrame) -> str:
        """Generate a detailed markdown report of the results"""
        report = []
        report.append("# Sentiment Model Robustness Analysis Report\n")
        report.append(f"**Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}\n")
        report.append(f"**Total Tests**: {len(df)}\n")
        report.append(f"**Models Tested**: {df['model'].nunique()}\n")
        report.append(f"**Datasets Used**: {len(set([t.split('_')[0] for t in df['test'].unique()]))}\n\n")
        # Executive Summary
        report.append("## Executive Summary\n")
        # Best performing model
        best_model = summary.groupby(level=0)['robustness_score'].mean().idxmax()
        report.append(f"- **Most Robust Model**: {best_model}\n")
        # Most challenging perturbation
        worst_pert = summary.groupby(level=1)['consistency'].mean().idxmin()
        report.append(f"- **Most Challenging Perturbation**: {worst_pert}\n")
        # Critical vulnerabilities
        critical = df[df['consistency'] < 0.7]
        if not critical.empty:
            report.append(f"- **Critical Vulnerabilities Found**: {len(critical)} tests with <70% consistency\n")
        report.append("\n## Detailed Findings\n")
        # Per-model analysis
        for model in df['model'].unique():
            model_data = df[df['model'] == model]
            model_summary = summary.loc[model]
            report.append(f"\n### {model}\n")
            report.append(f"- **Average Original Accuracy**: {model_data['orig_accuracy'].mean():.3f}\n")
            report.append(f"- **Average Consistency**: {model_data['consistency'].mean():.3f}\n")
            report.append(f"- **Average Robustness Score**: {model_summary['robustness_score'].mean():.3f}\n")
            # Strengths and weaknesses
            best_test = model_data.loc[model_data['consistency'].idxmax(), 'test']
            worst_test = model_data.loc[model_data['consistency'].idxmin(), 'test']
            report.append(f"- **Strongest Against**: {best_test} ({model_data['consistency'].max():.3f} consistency)\n")
            report.append(f"- **Weakest Against**: {worst_test} ({model_data['consistency'].min():.3f} consistency)\n")
        # Perturbation analysis
        report.append("\n## Perturbation Impact Analysis\n")
        pert_types = set([t.split('_', 1)[1] for t in df['test'].unique()])
        for pert_type in sorted(pert_types):
            pert_data = df[df['test'].str.contains(pert_type)]
            report.append(f"\n### {pert_type.replace('_', ' ').title()}\n")
            report.append(f"- **Average Consistency**: {pert_data['consistency'].mean():.3f}\n")
            report.append(f"- **Average Accuracy Drop**: {pert_data['accuracy_drop'].mean():.3f}\n")
            report.append(f"- **Most Affected Model**: {pert_data.groupby('model')['consistency'].mean().idxmin()}\n")
        return ''.join(report)

def visualize_results(df: pd.DataFrame, summary: pd.DataFrame):
    """Create visualizations of robustness test results"""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        # Set style
        plt.style.use('seaborn-v0_8-darkgrid')
        sns.set_palette("husl")
        fig, axes = plt.subplots(2, 3, figsize=(18, 10))
        # 1. Consistency across perturbations
        consistency_data = df.pivot_table(
            index='model',
            columns='test',
            values='consistency'
        )
        sns.heatmap(consistency_data, annot=True, fmt='.2f', cmap='RdYlGn',
                    ax=axes[0, 0], vmin=0.5, vmax=1.0)
        axes[0, 0].set_title('Consistency Scores by Model and Test')
        axes[0, 0].set_xlabel('')
        # 2. Accuracy drop
        acc_drop_data = df.pivot_table(
            index='model',
            columns='test',
            values='accuracy_drop'
        )
        sns.heatmap(acc_drop_data, annot=True, fmt='.2f', cmap='RdYlGn_r',
                    ax=axes[0, 1], center=0)
        axes[0, 1].set_title('Accuracy Drop by Model and Test')
        axes[0, 1].set_xlabel('')
        # 3. Average confidence change
        conf_change_data = df.pivot_table(
            index='model',
            columns='test',
            values='avg_conf_change'
        )
        sns.heatmap(conf_change_data, annot=True, fmt='.3f', cmap='YlOrRd',
                    ax=axes[0, 2], vmin=0, vmax=0.4)
        axes[0, 2].set_title('Average Confidence Change')
        axes[0, 2].set_xlabel('')
        # 4. Model comparison bar plot
        model_avg = df.groupby('model').agg({
            'consistency': 'mean',
            'correlation': 'mean',
            'accuracy_drop': 'mean'
        }).round(3)
        model_avg.plot(kind='bar', ax=axes[1, 0])
        axes[1, 0].set_title('Average Metrics by Model')
        axes[1, 0].set_xlabel('Model')
        axes[1, 0].legend(loc='best')
        axes[1, 0].tick_params(axis='x', rotation=45)
        # 5. Perturbation impact
        test_impact = df.groupby('test').agg({
            'consistency': 'mean',
            'accuracy_drop': lambda x: abs(x).mean()
        }).round(3)
        test_impact.plot(kind='bar', ax=axes[1, 1])
        axes[1, 1].set_title('Average Impact by Perturbation Type')
        axes[1, 1].set_xlabel('Perturbation')
        axes[1, 1].legend(['Consistency', 'Abs Accuracy Drop'])
        axes[1, 1].tick_params(axis='x', rotation=45)
        # 6. Robustness scores (if available)
        if 'robustness_score' in summary.columns:
            robust_scores = summary['robustness_score'].dropna().reset_index()
            robust_pivot = robust_scores.pivot_table(
                index='model',
                columns='test',
                values='robustness_score'
            )
            sns.heatmap(robust_pivot, annot=True, fmt='.2f', cmap='RdYlGn',
                        ax=axes[1, 2], vmin=0.5, vmax=1.0)
            axes[1, 2].set_title('Robustness Scores')
            axes[1, 2].set_xlabel('')
        plt.tight_layout()
        plt.savefig('robustness_analysis.png', dpi=150, bbox_inches='tight')
        plt.show()
        print("\nVisualization saved to 'robustness_analysis.png'")
    except ImportError:
        print("\nMatplotlib/Seaborn not installed. Skipping visualization.")

def main():
    """Main execution function"""
    # Configuration
    config = TestConfig(
        use_openai=False,  # Set to True and add API key to use OpenAI
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        max_samples_per_dataset=200,  # Reduce for faster testing
        random_seed=42
    )
    # Set random seeds
    random.seed(config.random_seed)
    np.random.seed(config.random_seed)
    torch.manual_seed(config.random_seed)
    # Models to test
    model_names = [
        "distilbert-base-uncased-finetuned-sst-2-english",
        "nlptown/bert-base-multilingual-uncased-sentiment",
        "cardiffnlp/twitter-roberta-base-sentiment"
    ]
    # Datasets to use
    dataset_names = ["sst2", "imdb"]  # Add "amazon_polarity" for more data
    # Initialize test suite
    test_suite = RobustnessTestSuite(config)
    # Load models and datasets
    print("Loading models...")
    models = test_suite.load_models(model_names)
    print("\nLoading datasets...")
    datasets = test_suite.load_datasets(dataset_names)
    # Run tests
    print("\nRunning robustness tests...")
    results_df = test_suite.run_tests(models, datasets)
    # Create summary table
    print("\nGenerating summary table...")
    summary_table = test_suite.create_summary_table(results_df)
    # Display results
    print("\n" + "="*80)
    print("ROBUSTNESS TEST RESULTS")
    print("="*80)
    print("\nDetailed Results:")
    print(results_df.to_string())
    print("\n" + "="*80)
    print("SUMMARY TABLE")
    print("="*80)
    print(summary_table.to_string())
    # Save results
    results_df.to_csv("robustness_test_results.csv", index=False)
    summary_table.to_csv("robustness_summary.csv")
    print("\nResults saved to 'robustness_test_results.csv' and 'robustness_summary.csv'")
    # Generate detailed report
    detailed_report = test_suite.generate_detailed_report(results_df, summary_table)
    with open("robustness_report.md", "w") as f:
        f.write(detailed_report)
    print("Detailed report saved to 'robustness_report.md'")
    # Generate insights
    print("\n" + "="*80)
    print("KEY INSIGHTS")
    print("="*80)
    # Find most/least robust models
    model_scores = summary_table.groupby(level=0)['robustness_score'].mean().sort_values(ascending=False)
    print(f"\nMost robust model: {model_scores.index[0]} (score: {model_scores.iloc[0]:.3f})")
    print(f"Least robust model: {model_scores.index[-1]} (score: {model_scores.iloc[-1]:.3f})")
    # Find most challenging perturbations
    test_impact = summary_table.groupby(level=1)['consistency'].mean().sort_values()
    print(f"\nMost challenging perturbation: {test_impact.index[0]} (consistency: {test_impact.iloc[0]:.3f})")
    print(f"Least challenging perturbation: {test_impact.index[-1]} (consistency: {test_impact.iloc[-1]:.3f})")
    # Generate visualizations
    visualize_results(results_df, summary_table)
    # Actionable recommendations
    print("\n" + "="*80)
    print("RECOMMENDATIONS")
    print("="*80)
    # Analyze each model's weaknesses
    for model_name in models.keys():
        model_data = results_df[results_df['model'] == model_name]
        worst_test = model_data.loc[model_data['consistency'].idxmin(), 'test']
        worst_consistency = model_data['consistency'].min()
        print(f"\n{model_name}:")
        print(f" Weakest against: {worst_test} (consistency: {worst_consistency:.3f})")
        if worst_consistency < 0.7:
            print(" ⚠️ Critical vulnerability detected - consider:")
            if 'negation' in worst_test:
                print(" - Add negation-aware training examples")
                print(" - Implement linguistic preprocessing for negation handling")
            elif 'sentiment_swap' in worst_test:
                print(" - Augment training with antonym substitutions")
                print(" - Add lexicon-based sentiment verification")
        elif worst_consistency < 0.9:
            print(" ⚡ Moderate vulnerability - consider data augmentation")
        else:
            print(" ✅ Generally robust")
    print("\n" + "="*80)
    print("STATISTICAL SUMMARY")
    print("="*80)
    print("\nOverall Statistics:")
    print(f"Average consistency across all tests: {results_df['consistency'].mean():.3f}")
    print(f"Average accuracy drop: {results_df['accuracy_drop'].mean():.3f}")
    print(f"Average confidence change: {results_df['avg_conf_change'].mean():.3f}")
    print(f"Tests with >10% accuracy drop: {(abs(results_df['accuracy_drop']) > 0.1).sum()}/{len(results_df)}")
    # Dataset-specific insights
    print("\nDataset-specific performance:")
    for dataset in ['sst2', 'imdb']:
        dataset_data = results_df[results_df['test'].str.startswith(dataset)]
        print(f"\n{dataset.upper()}:")
        print(f" Avg consistency: {dataset_data['consistency'].mean():.3f}")
        print(f" Avg accuracy: {dataset_data['orig_accuracy'].mean():.3f}")
        print(f" Most robust model: {dataset_data.groupby('model')['consistency'].mean().idxmax()}")


if __name__ == "__main__":
    main()
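
# Files written by main() (relative to the working directory): robustness_test_results.csv,
# robustness_summary.csv, robustness_report.md, and robustness_analysis.png
# (the PNG only when matplotlib/seaborn are available).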