@soodoku
Last active August 20, 2025 23:37
Ribeiro Style Robustness Checks
# -*- coding: utf-8 -*-
"""sentiment_ribeiro_style_checklist.ipynb

Sentiment Model Robustness Testing Script
Implements Ribeiro-style tests for evaluating model robustness.

Automatically generated by Colab. The original file is located at
https://colab.research.google.com/drive/1uyBg9-sidauCSSO6dcztl552HVSAgkUJ
"""
import os
import json
import random
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple, Optional, Callable
from dataclasses import dataclass
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
# Core dependencies
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from datasets import load_dataset
from tqdm import tqdm
import openai
from scipy.stats import pearsonr
import re
# Configuration
@dataclass
class TestConfig:
"""Configuration for robustness tests"""
use_openai: bool = False
openai_api_key: Optional[str] = None
max_samples_per_dataset: int = 500
random_seed: int = 42
device: str = "cuda" if torch.cuda.is_available() else "cpu"

class PerturbationGenerator:
    """Generate various perturbations for robustness testing"""

    def __init__(self, config: TestConfig):
        self.config = config
        if config.use_openai and config.openai_api_key:
            openai.api_key = config.openai_api_key
        # Intensity modifiers
        self.intensifiers = {
            'positive': ['absolutely', 'extremely', 'incredibly', 'totally', 'really', 'very'],
            'negative': ['somewhat', 'slightly', 'a bit', 'kind of', 'sort of', 'fairly']
        }
        # Spurious words that shouldn't affect sentiment
        self.spurious_words = [
            'literally', 'basically', 'actually', 'obviously', 'frankly',
            'honestly', 'seriously', 'definitely', 'certainly', 'surely'
        ]
        # Negation words
        self.negation_words = ['not', "n't", 'never', 'no', 'none', 'neither', 'nor']

    def add_intensity_modifier(self, text: str, sentiment: str, intensity_type: str = 'positive') -> str:
        """Add intensity modifiers to text"""
        words = text.split()
        # Find adjectives/adverbs (simplified heuristic)
        adj_positions = []
        for i, word in enumerate(words):
            if any(word.endswith(suffix) for suffix in ['ful', 'less', 'ous', 'ive', 'able', 'ed']):
                adj_positions.append(i)
        if adj_positions:
            pos = random.choice(adj_positions)
            modifier = random.choice(self.intensifiers[intensity_type])
            words.insert(pos, modifier)
        return ' '.join(words)

    def add_spurious_words(self, text: str, num_words: int = 2) -> str:
        """Insert spurious words that shouldn't affect sentiment"""
        words = text.split()
        for _ in range(min(num_words, len(self.spurious_words))):
            spurious = random.choice(self.spurious_words)
            position = random.randint(0, len(words))
            words.insert(position, spurious)
        return ' '.join(words)

    def add_negation(self, text: str) -> str:
        """Add negation to change sentiment"""
        words = text.split()
        # Find verbs or adjectives to negate
        neg_positions = []
        for i, word in enumerate(words):
            if i > 0 and word in ['is', 'was', 'are', 'were', 'am', 'be', 'been']:
                neg_positions.append(i)
        if neg_positions:
            pos = random.choice(neg_positions)
            words.insert(pos + 1, 'not')
        else:
            # Fallback: add "not" at the beginning
            words.insert(0, "Not")
        return ' '.join(words)
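
    # Note: the helper below targets the legacy openai<1.0 Python SDK
    # (openai.api_key + openai.ChatCompletion.create). With openai>=1.0 the
    # equivalent call is openai.OpenAI(api_key=...).chat.completions.create(...);
    # adjust accordingly if you run this with a newer client.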
    def generate_counterfactual_openai(self, text: str, target_sentiment: str) -> Optional[str]:
        """Use OpenAI to generate counterfactual with opposite sentiment"""
        if not self.config.use_openai:
            return None
        try:
            prompt = f"""
Rewrite the following text to have {target_sentiment} sentiment while keeping the same topic and structure:
Original: {text}
Rewritten:
"""
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that rewrites text sentiment."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=150,
                temperature=0.7
            )
            return response.choices[0].message['content'].strip()
        except Exception as e:
            print(f"OpenAI API error: {e}")
            return None

    def swap_sentiment_words(self, text: str) -> str:
        """Simple word-level sentiment swapping"""
        positive_words = {
            'good': 'bad', 'great': 'terrible', 'excellent': 'awful',
            'love': 'hate', 'beautiful': 'ugly', 'amazing': 'horrible',
            'wonderful': 'dreadful', 'fantastic': 'disastrous', 'best': 'worst'
        }
        negative_words = {v: k for k, v in positive_words.items()}
        words = text.split()
        for i, word in enumerate(words):
            word_lower = word.lower().strip('.,!?')
            if word_lower in positive_words:
                words[i] = word.replace(word_lower, positive_words[word_lower])
            elif word_lower in negative_words:
                words[i] = word.replace(word_lower, negative_words[word_lower])
        return ' '.join(words)
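
# Illustrative behaviour of the generators above (spurious-word placement is random):
#   gen = PerturbationGenerator(TestConfig())
#   gen.swap_sentiment_words("The movie was great and the cast was wonderful")
#       -> "The movie was terrible and the cast was dreadful"
#   gen.add_negation("The plot is clever")
#       -> "The plot is not clever"
#   gen.add_spurious_words("I loved this film")
#       -> e.g. "I honestly loved this basically film"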

class ModelEvaluator:
    """Evaluate models on original and perturbed data"""

    def __init__(self, models: Dict[str, pipeline], config: TestConfig):
        self.models = models
        self.config = config
        self.results = defaultdict(lambda: defaultdict(list))

    def predict_sentiment(self, model: pipeline, texts: List[str]) -> List[Dict]:
        """Get sentiment predictions from model"""
        predictions = []
        model_name = model.model.name_or_path if hasattr(model.model, 'name_or_path') else ""
        for text in tqdm(texts, desc="Predicting", leave=False):
            try:
                result = model(text[:512])  # Truncate long texts
                # Handle different output formats
                if isinstance(result, list) and len(result) > 0:
                    result = result[0]
                # Special handling for nlptown model (5-star ratings)
                if 'nlptown' in model_name:
                    # This model outputs 1-5 star ratings
                    if isinstance(result, dict):
                        label = result['label']
                        # Convert stars to sentiment: 1-2 stars = negative, 4-5 = positive, 3 = neutral
                        if '1' in label or '2' in label:
                            pos_score = 0.2
                            sentiment = 'negative'
                        elif '4' in label or '5' in label:
                            pos_score = 0.8
                            sentiment = 'positive'
                        else:  # 3 stars
                            pos_score = 0.5
                            sentiment = 'neutral'
                    else:
                        # Find the highest scoring label
                        max_score = 0
                        max_label = None
                        for item in result:
                            if item['score'] > max_score:
                                max_score = item['score']
                                max_label = item['label']
                        if '1' in max_label or '2' in max_label:
                            pos_score = 0.2
                            sentiment = 'negative'
                        elif '4' in max_label or '5' in max_label:
                            pos_score = 0.8
                            sentiment = 'positive'
                        else:
                            pos_score = 0.5
                            sentiment = 'neutral'
                else:
                    # Standard sentiment models
                    pos_score = 0.0
                    if isinstance(result, dict):
                        if result['label'].lower() in ['positive', 'pos', '1', 'label_1']:
                            pos_score = result['score']
                        elif result['label'].lower() in ['negative', 'neg', '0', 'label_0']:
                            pos_score = 1 - result['score']
                    else:
                        for item in result:
                            if item['label'].lower() in ['positive', 'pos', '1', 'label_1']:
                                pos_score = item['score']
                                break
                            elif item['label'].lower() in ['negative', 'neg', '0', 'label_0']:
                                pos_score = 1 - item['score']
                                break
                    sentiment = 'positive' if pos_score > 0.5 else 'negative'
                predictions.append({
                    'text': text,
                    'score': pos_score,
                    'label': sentiment
                })
            except Exception as e:
                print(f"Prediction error: {e}")
                predictions.append({
                    'text': text,
                    'score': 0.5,
                    'label': 'neutral'
                })
        return predictions
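
    # Caveat on the generic label mapping above: it assumes binary models whose labels
    # look like POSITIVE/NEGATIVE or LABEL_0/LABEL_1. Three-class models such as
    # cardiffnlp/twitter-roberta-base-sentiment report LABEL_0/LABEL_1/LABEL_2
    # (negative/neutral/positive), so their positive class falls through to the
    # 0.0 default; extend the label lists if that model needs exact handling.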
    def evaluate_perturbation(self,
                              model_name: str,
                              model: pipeline,
                              original_texts: List[str],
                              perturbed_texts: List[str],
                              original_labels: List[int],
                              test_name: str) -> Dict:
        """Evaluate model on original vs perturbed texts"""
        # Get predictions
        orig_preds = self.predict_sentiment(model, original_texts)
        pert_preds = self.predict_sentiment(model, perturbed_texts)
        # Calculate metrics
        orig_accuracy = np.mean([
            (p['label'] == 'positive' and l == 1) or
            (p['label'] == 'negative' and l == 0)
            for p, l in zip(orig_preds, original_labels)
        ])
        pert_accuracy = np.mean([
            (p['label'] == 'positive' and l == 1) or
            (p['label'] == 'negative' and l == 0)
            for p, l in zip(pert_preds, original_labels)
        ])
        # Consistency: how often model gives same prediction
        consistency = np.mean([
            o['label'] == p['label']
            for o, p in zip(orig_preds, pert_preds)
        ])
        # Score correlation
        orig_scores = [p['score'] for p in orig_preds]
        pert_scores = [p['score'] for p in pert_preds]
        correlation, _ = pearsonr(orig_scores, pert_scores)
        # Average confidence change
        conf_change = np.mean([
            abs(o['score'] - p['score'])
            for o, p in zip(orig_preds, pert_preds)
        ])
        return {
            'model': model_name,
            'test': test_name,
            'orig_accuracy': orig_accuracy,
            'pert_accuracy': pert_accuracy,
            'consistency': consistency,
            'correlation': correlation,
            'avg_conf_change': conf_change,
            'accuracy_drop': orig_accuracy - pert_accuracy
        }
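
# How to read the metrics above, in the Ribeiro-style CheckList spirit:
# - For label-preserving perturbations (spurious_words, intensity_positive/negative),
#   high consistency and a small accuracy_drop indicate robustness (an invariance-style check).
# - For label-flipping perturbations (sentiment_swap, negation, openai_counterfactual),
#   a model that tracks the edit should change its prediction, so low consistency on
#   those tests is expected behaviour rather than a failure (a directional-style check).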

class RobustnessTestSuite:
    """Main test suite for robustness evaluation"""

    def __init__(self, config: TestConfig):
        self.config = config
        self.generator = PerturbationGenerator(config)
        self.results = []

    def load_models(self, model_names: List[str]) -> Dict[str, pipeline]:
        """Load sentiment analysis models"""
        models = {}
        for name in model_names:
            print(f"Loading model: {name}")
            try:
                models[name] = pipeline(
                    "sentiment-analysis",
                    model=name,
                    device=0 if self.config.device == "cuda" else -1
                )
            except Exception as e:
                print(f"Failed to load {name}: {e}")
        return models

    def load_datasets(self, dataset_names: List[str]) -> List[Tuple[List[str], List[int], str]]:
        """Load sentiment datasets"""
        datasets = []
        for name in dataset_names:
            print(f"Loading dataset: {name}")
            try:
                if name == "imdb":
                    ds = load_dataset("imdb", split="test")
                    texts = ds['text'][:self.config.max_samples_per_dataset]
                    labels = ds['label'][:self.config.max_samples_per_dataset]
                elif name == "sst2":
                    ds = load_dataset("glue", "sst2", split="validation")
                    texts = ds['sentence'][:self.config.max_samples_per_dataset]
                    labels = ds['label'][:self.config.max_samples_per_dataset]
                elif name == "amazon_polarity":
                    ds = load_dataset("amazon_polarity", split="test")
                    texts = [t[:1000] for t in ds['content'][:self.config.max_samples_per_dataset]]  # Truncate
                    labels = ds['label'][:self.config.max_samples_per_dataset]
                else:
                    continue
                datasets.append((texts, labels, name))
            except Exception as e:
                print(f"Failed to load {name}: {e}")
        return datasets

    def run_tests(self, models: Dict[str, pipeline], datasets: List[Tuple]) -> pd.DataFrame:
        """Run all robustness tests"""
        evaluator = ModelEvaluator(models, self.config)
        tests = [
            ("intensity_positive", lambda t, l: self.generator.add_intensity_modifier(t, l, 'positive')),
            ("intensity_negative", lambda t, l: self.generator.add_intensity_modifier(t, l, 'negative')),
            ("spurious_words", lambda t, l: self.generator.add_spurious_words(t)),
            ("negation", lambda t, l: self.generator.add_negation(t)),
            ("sentiment_swap", lambda t, l: self.generator.swap_sentiment_words(t))
        ]
        # Add OpenAI counterfactual if enabled
        if self.config.use_openai:
            tests.append(("openai_counterfactual",
                          lambda t, l: self.generator.generate_counterfactual_openai(
                              t, "negative" if l == 1 else "positive") or t))
        for dataset_texts, dataset_labels, dataset_name in datasets:
            print(f"\nTesting on {dataset_name}")
            for test_name, perturbation_fn in tests:
                print(f" Running {test_name} test...")
                # Generate perturbations
                perturbed_texts = []
                for text, label in zip(dataset_texts, dataset_labels):
                    try:
                        perturbed = perturbation_fn(text, label)
                        perturbed_texts.append(perturbed)
                    except Exception:
                        perturbed_texts.append(text)  # Fall back to the original text
                # Evaluate each model
                for model_name, model in models.items():
                    result = evaluator.evaluate_perturbation(
                        model_name, model,
                        dataset_texts, perturbed_texts,
                        dataset_labels,
                        f"{dataset_name}_{test_name}"
                    )
                    self.results.append(result)
        # Create results DataFrame
        df = pd.DataFrame(self.results)
        return df

    def create_summary_table(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create summary table of results"""
        # Pivot table for better visualization
        summary = df.pivot_table(
            index=['model', 'test'],
            values=['orig_accuracy', 'pert_accuracy', 'consistency', 'correlation', 'avg_conf_change'],
            aggfunc='mean'
        ).round(3)
        # Add robustness score (composite metric)
        # Handle NaN correlations by using 0.5 as neutral value
        correlation_filled = summary['correlation'].fillna(0.5)
        summary['robustness_score'] = (
            summary['consistency'] * 0.3 +
            correlation_filled * 0.3 +
            (1 - summary['avg_conf_change']) * 0.2 +
            (1 - abs(summary['orig_accuracy'] - summary['pert_accuracy'])) * 0.2
        ).round(3)
        return summary
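
    # The composite robustness_score above is simply a weighted sum (weights chosen to total 1.0):
    #   robustness_score = 0.3 * consistency
    #                    + 0.3 * correlation            (NaN correlations treated as 0.5)
    #                    + 0.2 * (1 - avg_conf_change)
    #                    + 0.2 * (1 - |orig_accuracy - pert_accuracy|)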
    def generate_detailed_report(self, df: pd.DataFrame, summary: pd.DataFrame) -> str:
        """Generate a detailed markdown report of the results"""
        report = []
        report.append("# Sentiment Model Robustness Analysis Report\n")
        report.append(f"**Date**: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}\n")
        report.append(f"**Total Tests**: {len(df)}\n")
        report.append(f"**Models Tested**: {df['model'].nunique()}\n")
        report.append(f"**Datasets Used**: {len(set([t.split('_')[0] for t in df['test'].unique()]))}\n\n")
        # Executive Summary
        report.append("## Executive Summary\n")
        # Best performing model
        best_model = summary.groupby(level=0)['robustness_score'].mean().idxmax()
        report.append(f"- **Most Robust Model**: {best_model}\n")
        # Most challenging perturbation
        worst_pert = summary.groupby(level=1)['consistency'].mean().idxmin()
        report.append(f"- **Most Challenging Perturbation**: {worst_pert}\n")
        # Critical vulnerabilities
        critical = df[df['consistency'] < 0.7]
        if not critical.empty:
            report.append(f"- **Critical Vulnerabilities Found**: {len(critical)} tests with <70% consistency\n")
        report.append("\n## Detailed Findings\n")
        # Per-model analysis
        for model in df['model'].unique():
            model_data = df[df['model'] == model]
            model_summary = summary.loc[model]
            report.append(f"\n### {model}\n")
            report.append(f"- **Average Original Accuracy**: {model_data['orig_accuracy'].mean():.3f}\n")
            report.append(f"- **Average Consistency**: {model_data['consistency'].mean():.3f}\n")
            report.append(f"- **Average Robustness Score**: {model_summary['robustness_score'].mean():.3f}\n")
            # Strengths and weaknesses
            best_test = model_data.loc[model_data['consistency'].idxmax(), 'test']
            worst_test = model_data.loc[model_data['consistency'].idxmin(), 'test']
            report.append(f"- **Strongest Against**: {best_test} ({model_data['consistency'].max():.3f} consistency)\n")
            report.append(f"- **Weakest Against**: {worst_test} ({model_data['consistency'].min():.3f} consistency)\n")
        # Perturbation analysis
        report.append("\n## Perturbation Impact Analysis\n")
        pert_types = set([t.split('_', 1)[1] for t in df['test'].unique()])
        for pert_type in sorted(pert_types):
            pert_data = df[df['test'].str.contains(pert_type)]
            report.append(f"\n### {pert_type.replace('_', ' ').title()}\n")
            report.append(f"- **Average Consistency**: {pert_data['consistency'].mean():.3f}\n")
            report.append(f"- **Average Accuracy Drop**: {pert_data['accuracy_drop'].mean():.3f}\n")
            report.append(f"- **Most Affected Model**: {pert_data.groupby('model')['consistency'].mean().idxmin()}\n")
        return ''.join(report)

def visualize_results(df: pd.DataFrame, summary: pd.DataFrame):
    """Create visualizations of robustness test results"""
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns

        # Set style
        plt.style.use('seaborn-v0_8-darkgrid')
        sns.set_palette("husl")
        fig, axes = plt.subplots(2, 3, figsize=(18, 10))
        # 1. Consistency across perturbations
        consistency_data = df.pivot_table(
            index='model',
            columns='test',
            values='consistency'
        )
        sns.heatmap(consistency_data, annot=True, fmt='.2f', cmap='RdYlGn',
                    ax=axes[0, 0], vmin=0.5, vmax=1.0)
        axes[0, 0].set_title('Consistency Scores by Model and Test')
        axes[0, 0].set_xlabel('')
        # 2. Accuracy drop
        acc_drop_data = df.pivot_table(
            index='model',
            columns='test',
            values='accuracy_drop'
        )
        sns.heatmap(acc_drop_data, annot=True, fmt='.2f', cmap='RdYlGn_r',
                    ax=axes[0, 1], center=0)
        axes[0, 1].set_title('Accuracy Drop by Model and Test')
        axes[0, 1].set_xlabel('')
        # 3. Average confidence change
        conf_change_data = df.pivot_table(
            index='model',
            columns='test',
            values='avg_conf_change'
        )
        sns.heatmap(conf_change_data, annot=True, fmt='.3f', cmap='YlOrRd',
                    ax=axes[0, 2], vmin=0, vmax=0.4)
        axes[0, 2].set_title('Average Confidence Change')
        axes[0, 2].set_xlabel('')
        # 4. Model comparison bar plot
        model_avg = df.groupby('model').agg({
            'consistency': 'mean',
            'correlation': 'mean',
            'accuracy_drop': 'mean'
        }).round(3)
        model_avg.plot(kind='bar', ax=axes[1, 0])
        axes[1, 0].set_title('Average Metrics by Model')
        axes[1, 0].set_xlabel('Model')
        axes[1, 0].legend(loc='best')
        axes[1, 0].tick_params(axis='x', rotation=45)
        # 5. Perturbation impact
        test_impact = df.groupby('test').agg({
            'consistency': 'mean',
            'accuracy_drop': lambda x: abs(x).mean()
        }).round(3)
        test_impact.plot(kind='bar', ax=axes[1, 1])
        axes[1, 1].set_title('Average Impact by Perturbation Type')
        axes[1, 1].set_xlabel('Perturbation')
        axes[1, 1].legend(['Consistency', 'Abs Accuracy Drop'])
        axes[1, 1].tick_params(axis='x', rotation=45)
        # 6. Robustness scores (if available)
        if 'robustness_score' in summary.columns:
            robust_scores = summary['robustness_score'].dropna().reset_index()
            robust_pivot = robust_scores.pivot_table(
                index='model',
                columns='test',
                values='robustness_score'
            )
            sns.heatmap(robust_pivot, annot=True, fmt='.2f', cmap='RdYlGn',
                        ax=axes[1, 2], vmin=0.5, vmax=1.0)
            axes[1, 2].set_title('Robustness Scores')
            axes[1, 2].set_xlabel('')
        plt.tight_layout()
        plt.savefig('robustness_analysis.png', dpi=150, bbox_inches='tight')
        plt.show()
        print("\nVisualization saved to 'robustness_analysis.png'")
    except ImportError:
        print("\nMatplotlib/Seaborn not installed. Skipping visualization.")

def main():
    """Main execution function"""
    # Configuration
    config = TestConfig(
        use_openai=False,  # Set to True and add API key to use OpenAI
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        max_samples_per_dataset=200,  # Reduce for faster testing
        random_seed=42
    )
    # Set random seeds
    random.seed(config.random_seed)
    np.random.seed(config.random_seed)
    torch.manual_seed(config.random_seed)
    # Models to test
    model_names = [
        "distilbert-base-uncased-finetuned-sst-2-english",
        "nlptown/bert-base-multilingual-uncased-sentiment",
        "cardiffnlp/twitter-roberta-base-sentiment"
    ]
    # Datasets to use
    dataset_names = ["sst2", "imdb"]  # Add "amazon_polarity" for more data
    # Initialize test suite
    test_suite = RobustnessTestSuite(config)
    # Load models and datasets
    print("Loading models...")
    models = test_suite.load_models(model_names)
    print("\nLoading datasets...")
    datasets = test_suite.load_datasets(dataset_names)
    # Run tests
    print("\nRunning robustness tests...")
    results_df = test_suite.run_tests(models, datasets)
    # Create summary table
    print("\nGenerating summary table...")
    summary_table = test_suite.create_summary_table(results_df)
    # Display results
    print("\n" + "="*80)
    print("ROBUSTNESS TEST RESULTS")
    print("="*80)
    print("\nDetailed Results:")
    print(results_df.to_string())
    print("\n" + "="*80)
    print("SUMMARY TABLE")
    print("="*80)
    print(summary_table.to_string())
    # Save results
    results_df.to_csv("robustness_test_results.csv", index=False)
    summary_table.to_csv("robustness_summary.csv")
    print("\nResults saved to 'robustness_test_results.csv' and 'robustness_summary.csv'")
    # Generate detailed report
    detailed_report = test_suite.generate_detailed_report(results_df, summary_table)
    with open("robustness_report.md", "w") as f:
        f.write(detailed_report)
    print("Detailed report saved to 'robustness_report.md'")
    # Generate insights
    print("\n" + "="*80)
    print("KEY INSIGHTS")
    print("="*80)
    # Find most/least robust models
    model_scores = summary_table.groupby(level=0)['robustness_score'].mean().sort_values(ascending=False)
    print(f"\nMost robust model: {model_scores.index[0]} (score: {model_scores.iloc[0]:.3f})")
    print(f"Least robust model: {model_scores.index[-1]} (score: {model_scores.iloc[-1]:.3f})")
    # Find most challenging perturbations
    test_impact = summary_table.groupby(level=1)['consistency'].mean().sort_values()
    print(f"\nMost challenging perturbation: {test_impact.index[0]} (consistency: {test_impact.iloc[0]:.3f})")
    print(f"Least challenging perturbation: {test_impact.index[-1]} (consistency: {test_impact.iloc[-1]:.3f})")
    # Generate visualizations
    visualize_results(results_df, summary_table)
    # Actionable recommendations
    print("\n" + "="*80)
    print("RECOMMENDATIONS")
    print("="*80)
    # Analyze each model's weaknesses
    for model_name in models.keys():
        model_data = results_df[results_df['model'] == model_name]
        worst_test = model_data.loc[model_data['consistency'].idxmin(), 'test']
        worst_consistency = model_data['consistency'].min()
        print(f"\n{model_name}:")
        print(f" Weakest against: {worst_test} (consistency: {worst_consistency:.3f})")
        if worst_consistency < 0.7:
            print(" ⚠️ Critical vulnerability detected - consider:")
            if 'negation' in worst_test:
                print(" - Add negation-aware training examples")
                print(" - Implement linguistic preprocessing for negation handling")
            elif 'sentiment_swap' in worst_test:
                print(" - Augment training with antonym substitutions")
                print(" - Add lexicon-based sentiment verification")
        elif worst_consistency < 0.9:
            print(" ⚡ Moderate vulnerability - consider data augmentation")
        else:
            print(" ✅ Generally robust")
    print("\n" + "="*80)
    print("STATISTICAL SUMMARY")
    print("="*80)
    print("\nOverall Statistics:")
    print(f"Average consistency across all tests: {results_df['consistency'].mean():.3f}")
    print(f"Average accuracy drop: {results_df['accuracy_drop'].mean():.3f}")
    print(f"Average confidence change: {results_df['avg_conf_change'].mean():.3f}")
    print(f"Tests with >10% accuracy drop: {(abs(results_df['accuracy_drop']) > 0.1).sum()}/{len(results_df)}")
    # Dataset-specific insights
    print("\nDataset-specific performance:")
    for dataset in ['sst2', 'imdb']:
        dataset_data = results_df[results_df['test'].str.startswith(dataset)]
        print(f"\n{dataset.upper()}:")
        print(f" Avg consistency: {dataset_data['consistency'].mean():.3f}")
        print(f" Avg accuracy: {dataset_data['orig_accuracy'].mean():.3f}")
        print(f" Most robust model: {dataset_data.groupby('model')['consistency'].mean().idxmax()}")


if __name__ == "__main__":
    main()
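
# Files written by main() (relative to the working directory): robustness_test_results.csv,
# robustness_summary.csv, robustness_report.md, and robustness_analysis.png
# (the PNG only when matplotlib/seaborn are available).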