@btcross26
Created November 25, 2024 14:37
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset
from evidently.tests import TestColumnDrift

# Load your data
reference_data = pd.read_csv('reference_data.csv')  # Your initial training data
current_data = pd.read_csv('current_data.csv')      # Your new production data

# Assuming 'target' is your model's output column
target_column = 'target'
column_mapping = ColumnMapping(target=target_column)  # evidently expects a ColumnMapping object, not a dict

# Create a Data Drift report
data_drift_report = Report(metrics=[
    DataDriftPreset(),
])
data_drift_report.run(reference_data=reference_data, current_data=current_data)
data_drift_report.save_html("data_drift_report.html")

# Create a Target (concept) Drift report
# Note: evidently exposes concept drift as "target drift"; there is no ConceptDriftPreset.
target_drift_report = Report(metrics=[
    TargetDriftPreset(),
])
target_drift_report.run(reference_data=reference_data, current_data=current_data,
                        column_mapping=column_mapping)
target_drift_report.save_html("target_drift_report.html")

# Create a Test Suite for more detailed analysis
test_suite = TestSuite(tests=[
    DataDriftTestPreset(),
    TestColumnDrift(column_name=target_column),  # drift test on the target column
])
test_suite.run(reference_data=reference_data, current_data=current_data,
               column_mapping=column_mapping)
test_suite.save_html("drift_test_suite.html")
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import ClassificationPreset
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset, BinaryClassificationTestPreset
from evidently.tests import TestColumnDrift

# Load your data
reference_data = pd.read_csv('reference_data.csv')
current_data = pd.read_csv('current_data.csv')

# Assuming 'target' is your actual target column and 'prediction' is your model's prediction
target_column = 'target'
prediction_column = 'prediction'
column_mapping = ColumnMapping(target=target_column, prediction=prediction_column)

# Create a Classification Performance report
# Note: the metric preset is called ClassificationPreset, not ClassificationPerformancePreset.
classification_performance_report = Report(metrics=[
    ClassificationPreset(),
])
classification_performance_report.run(reference_data=reference_data,
                                      current_data=current_data,
                                      column_mapping=column_mapping)
classification_performance_report.save_html("classification_performance_report.html")

# Create a comprehensive Test Suite
test_suite = TestSuite(tests=[
    DataDriftTestPreset(),
    TestColumnDrift(column_name=target_column),  # target (concept) drift
    BinaryClassificationTestPreset(),            # use MulticlassClassificationTestPreset for multiclass targets
])
test_suite.run(reference_data=reference_data,
               current_data=current_data,
               column_mapping=column_mapping)
test_suite.save_html("comprehensive_test_suite.html")
import matplotlib.pyplot as plt
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset

# evidently has no matplotlib renderer or get_plot() API; reports render to HTML/JSON.
# To get matplotlib figures, pull the computed numbers out of the report with as_dict()
# and plot them yourself. The key names below follow the schema of recent evidently
# releases and may need adjusting for your version.

# Create and run reports
data_drift_report = Report(metrics=[DataDriftPreset()])
target_drift_report = Report(metrics=[TargetDriftPreset()])
data_drift_report.run(reference_data=reference_data, current_data=current_data)
target_drift_report.run(reference_data=reference_data, current_data=current_data,
                        column_mapping=column_mapping)

# Extract per-column drift scores from the DataDriftTable result
drift_table = next(m["result"] for m in data_drift_report.as_dict()["metrics"]
                   if m["metric"] == "DataDriftTable")
drift_scores = {col: info["drift_score"]
                for col, info in drift_table["drift_by_columns"].items()}

# Plot the drift scores as a bar chart (the target drift report's as_dict()
# can be plotted the same way)
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(list(drift_scores.keys()), list(drift_scores.values()))
ax.set_title("Data Drift (per-column drift score)")
ax.tick_params(axis="x", rotation=45)
plt.tight_layout()
plt.savefig("drift_plots.png")
plt.close(fig)
# Reports can also be exported as a JSON string (e.g., for logging or storage)
json_report = data_drift_report.json()
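The JSON export is just a string; if you persist it, parse it back with the standard library. A tiny sketch (the top-level layout mirrors as_dict(), with a "metrics" list, in recent evidently releases):

import json

# Sketch: parse the JSON export back into a dict for downstream processing.
parsed = json.loads(json_report)
print(f"Report contains {len(parsed.get('metrics', []))} metric entries")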
# Legacy (pre-0.2) evidently API: Dashboard/tabs have been superseded by Report/presets above.
# There is no ConceptDriftTab; target drift was covered by CatTargetDriftTab / NumTargetDriftTab.
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataDriftTab, CatTargetDriftTab

dashboard = Dashboard(tabs=[DataDriftTab(), CatTargetDriftTab()])
dashboard.calculate(reference_data, current_data, column_mapping=column_mapping)
dashboard.save("my_dashboard.html")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset

# As above, evidently does not ship a matplotlib renderer; build the figures
# from as_dict() output instead (key names assume a recent evidently release).

# Create and run reports
data_drift_report = Report(metrics=[DataDriftPreset()])
target_drift_report = Report(metrics=[TargetDriftPreset()])
data_drift_report.run(reference_data=reference_data, current_data=current_data)
target_drift_report.run(reference_data=reference_data, current_data=current_data,
                        column_mapping=column_mapping)

# Per-column drift scores for the data drift page
drift_table = next(m["result"] for m in data_drift_report.as_dict()["metrics"]
                   if m["metric"] == "DataDriftTable")
drift_scores = {col: info["drift_score"]
                for col, info in drift_table["drift_by_columns"].items()}

# Create PDF
with PdfPages('drift_plots.pdf') as pdf:
    # Data Drift page
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.bar(list(drift_scores.keys()), list(drift_scores.values()))
    ax.set_title("Data Drift (per-column drift score)")
    ax.tick_params(axis="x", rotation=45)
    plt.tight_layout()
    pdf.savefig(fig)
    plt.close(fig)

    # Concept (target) Drift page: compare target distributions directly
    # (assumes a numeric or label-encoded target; use value_counts() bars for string labels)
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.hist(reference_data[target_column], bins=30, alpha=0.5, density=True, label="reference")
    ax.hist(current_data[target_column], bins=30, alpha=0.5, density=True, label="current")
    ax.set_title("Concept Drift (target distribution)")
    ax.legend()
    plt.tight_layout()
    pdf.savefig(fig)
    plt.close(fig)

    # You can add more pages here if needed

print("PDF created successfully: drift_plots.pdf")
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt


def monitor_multinomial_distribution(current_dist, benchmark_dist, n_samples,
                                     alpha=0.05, practical_threshold=0.1):
    """
    Monitor multinomial distribution and detect anomalous categories

    Parameters:
    current_dist: dict or Series - Current category proportions
    benchmark_dist: dict or Series - Benchmark category proportions
    n_samples: int - Number of samples in current distribution
    alpha: float - Statistical significance level
    practical_threshold: float - Minimum absolute difference to flag
    """
    # Convert to pandas Series if needed
    current = pd.Series(current_dist)
    benchmark = pd.Series(benchmark_dist)

    # Ensure both distributions have the same categories
    all_categories = sorted(set(current.index) | set(benchmark.index))
    current = current.reindex(all_categories, fill_value=0)
    benchmark = benchmark.reindex(all_categories, fill_value=0)

    # Chi-square test for the overall distribution (proportions scaled to expected counts)
    chi2, p_value = stats.chisquare(current * n_samples, benchmark * n_samples)

    # Calculate per-category statistics
    results = []
    for category in all_categories:
        # Current and benchmark proportions
        p1 = current[category]
        p2 = benchmark[category]

        # Standard error of the proportion under the benchmark
        se = np.sqrt((p2 * (1 - p2)) / n_samples)

        # Z-score
        z_score = (p1 - p2) / se if se > 0 else 0

        # Two-tailed p-value
        p_value_cat = 2 * (1 - stats.norm.cdf(abs(z_score)))

        # Absolute difference
        abs_diff = abs(p1 - p2)

        # Relative difference (percentage change)
        rel_diff = ((p1 - p2) / p2 * 100) if p2 != 0 else np.inf

        # Flag if statistically AND practically significant
        is_anomalous = (p_value_cat < alpha) and (abs_diff > practical_threshold)

        results.append({
            'category': category,
            'current_prop': p1,
            'benchmark_prop': p2,
            'abs_difference': abs_diff,
            'relative_diff_pct': rel_diff,
            'z_score': z_score,
            'p_value': p_value_cat,
            'is_anomalous': is_anomalous
        })

    results_df = pd.DataFrame(results)

    # Create visualization: bar plot of proportion differences with threshold lines
    plt.figure(figsize=(12, 6))
    plt.bar(range(len(all_categories)),
            results_df['current_prop'] - results_df['benchmark_prop'],
            tick_label=all_categories)
    plt.axhline(y=practical_threshold, color='r', linestyle='--', alpha=0.5)
    plt.axhline(y=-practical_threshold, color='r', linestyle='--', alpha=0.5)
    plt.title('Category Proportion Differences\n(Current - Benchmark)')
    plt.xticks(rotation=45)
    plt.tight_layout()

    return {
        'overall_chi2_stat': chi2,
        'overall_p_value': p_value,
        'category_results': results_df,
        'plot': plt.gcf()
    }
# Example usage:
if __name__ == "__main__":
    # Example data
    benchmark = {
        'A': 0.3,
        'B': 0.3,
        'C': 0.2,
        'D': 0.2
    }
    current = {
        'A': 0.35,
        'B': 0.25,
        'C': 0.15,
        'D': 0.25
    }

    results = monitor_multinomial_distribution(
        current,
        benchmark,
        n_samples=1000,
        alpha=0.05,
        practical_threshold=0.05
    )

    # Print results
    print("\nOverall Distribution Test:")
    print(f"Chi-square statistic: {results['overall_chi2_stat']:.2f}")
    print(f"P-value: {results['overall_p_value']:.4f}")

    print("\nAnomalous Categories:")
    anomalies = results['category_results'][results['category_results']['is_anomalous']]
    if len(anomalies) > 0:
        print(anomalies[['category', 'current_prop', 'benchmark_prop',
                         'abs_difference', 'relative_diff_pct']])
    else:
        print("No anomalous categories detected")

    plt.show()
def create_alerts(results, abs_threshold=0.05, rel_threshold=20):
    """Generate alerts based on monitoring results.

    Note: abs_threshold and rel_threshold are accepted for future use; the
    per-category flags come from results['category_results']['is_anomalous'],
    which was computed with the thresholds passed to the monitor.
    """
    alerts = []

    # Overall distribution alert
    if results['overall_p_value'] < 0.05:
        alerts.append({
            'level': 'WARNING',
            'message': f"Overall distribution shows significant change (p={results['overall_p_value']:.4f})"
        })

    # Category-specific alerts
    for _, row in results['category_results'].iterrows():
        if row['is_anomalous']:
            alerts.append({
                'level': 'WARNING',
                'category': row['category'],
                'message': f"Category {row['category']} shows significant change: "
                           f"absolute diff={row['abs_difference']:.3f}, "
                           f"relative diff={row['relative_diff_pct']:.1f}%"
            })

    return alerts
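create_alerts is never exercised in the gist; a minimal usage sketch, reusing the example proportions from the block above:

# Usage sketch: feed the monitor output into create_alerts
# (hypothetical proportions matching the example above)
example_benchmark = {'A': 0.3, 'B': 0.3, 'C': 0.2, 'D': 0.2}
example_current = {'A': 0.35, 'B': 0.25, 'C': 0.15, 'D': 0.25}

example_results = monitor_multinomial_distribution(
    example_current, example_benchmark, n_samples=1000, practical_threshold=0.05
)
for alert in create_alerts(example_results):
    print(f"{alert['level']}: {alert['message']}")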
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp, probplot


def monitor_continuous_distribution(
        current_data,
        benchmark_data,
        feature_name,
        quantile_thresholds=(0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99),
        ks_alpha=0.05,
        practical_threshold_std=0.5):
    """
    Monitor continuous distribution against benchmark

    Parameters:
    current_data: array-like - Current distribution samples
    benchmark_data: array-like - Benchmark distribution samples
    feature_name: str - Name of the feature being monitored
    quantile_thresholds: sequence - Quantiles to monitor
    ks_alpha: float - Significance level for KS test
    practical_threshold_std: float - Threshold for std dev differences
    """
    # Convert to numpy arrays
    current = np.array(current_data)
    benchmark = np.array(benchmark_data)

    # Basic statistics
    stats_dict = {
        'mean_current': np.mean(current),
        'mean_benchmark': np.mean(benchmark),
        'std_current': np.std(current),
        'std_benchmark': np.std(benchmark),
        'median_current': np.median(current),
        'median_benchmark': np.median(benchmark)
    }

    # Calculate quantiles
    current_quantiles = np.quantile(current, quantile_thresholds)
    benchmark_quantiles = np.quantile(benchmark, quantile_thresholds)

    # Perform KS test
    ks_stat, ks_pvalue = ks_2samp(current, benchmark)

    # Calculate practical differences
    mean_diff = stats_dict['mean_current'] - stats_dict['mean_benchmark']
    std_diff = stats_dict['std_current'] - stats_dict['std_benchmark']

    # Standardized differences (in terms of benchmark std dev)
    mean_diff_std = mean_diff / stats_dict['std_benchmark']
    std_diff_relative = std_diff / stats_dict['std_benchmark']

    # Create visualization
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

    # Density plot
    sns.kdeplot(data=current, label='Current', ax=ax1)
    sns.kdeplot(data=benchmark, label='Benchmark', ax=ax1)
    ax1.set_title(f'Distribution Comparison for {feature_name}')
    ax1.legend()

    # Q-Q plot
    probplot(current, dist="norm", plot=ax2)
    ax2.set_title('Q-Q Plot of Current Distribution')
    plt.tight_layout()

    # Create quantile comparison DataFrame
    quantile_df = pd.DataFrame({
        'quantile': list(quantile_thresholds),
        'current': current_quantiles,
        'benchmark': benchmark_quantiles,
        'abs_diff': np.abs(current_quantiles - benchmark_quantiles),
        'relative_diff': (current_quantiles - benchmark_quantiles) / benchmark_quantiles * 100
    })

    # Determine if distribution has shifted significantly
    is_significant = {
        'statistical': ks_pvalue < ks_alpha,
        'practical_mean': abs(mean_diff_std) > practical_threshold_std,
        'practical_std': abs(std_diff_relative) > practical_threshold_std
    }

    return {
        'basic_stats': stats_dict,
        'quantile_comparison': quantile_df,
        'ks_test': {'statistic': ks_stat, 'pvalue': ks_pvalue},
        'standardized_differences': {
            'mean_diff_std': mean_diff_std,
            'std_diff_relative': std_diff_relative
        },
        'is_significant': is_significant,
        'plot': fig
    }
def generate_alerts(results, feature_name):
    """Generate monitoring alerts based on results"""
    alerts = []

    # Statistical significance alert
    if results['is_significant']['statistical']:
        alerts.append({
            'level': 'WARNING',
            'message': f"{feature_name}: Distribution change detected (KS test p-value: {results['ks_test']['pvalue']:.4f})"
        })

    # Practical significance alerts
    if results['is_significant']['practical_mean']:
        alerts.append({
            'level': 'WARNING',
            'message': f"{feature_name}: Significant mean shift detected ({results['standardized_differences']['mean_diff_std']:.2f} std dev)"
        })
    if results['is_significant']['practical_std']:
        alerts.append({
            'level': 'WARNING',
            'message': f"{feature_name}: Significant spread change detected ({results['standardized_differences']['std_diff_relative']:.2f} relative std dev)"
        })

    return alerts
# Example usage:
if __name__ == "__main__":
    # Generate example data
    np.random.seed(42)
    benchmark_data = np.random.normal(100, 15, 1000)
    # Current data with slight shift and different spread
    current_data = np.random.normal(105, 17, 1000)

    # Run monitoring
    results = monitor_continuous_distribution(
        current_data,
        benchmark_data,
        feature_name="Example Feature",
        practical_threshold_std=0.3
    )

    # Print results
    print("\nBasic Statistics:")
    for k, v in results['basic_stats'].items():
        print(f"{k}: {v:.2f}")

    print("\nQuantile Comparison:")
    print(results['quantile_comparison'])

    print("\nKS Test Results:")
    print(f"Statistic: {results['ks_test']['statistic']:.4f}")
    print(f"P-value: {results['ks_test']['pvalue']:.4f}")

    print("\nStandardized Differences:")
    print(f"Mean difference (in std dev): {results['standardized_differences']['mean_diff_std']:.2f}")
    print(f"Std dev relative difference: {results['standardized_differences']['std_diff_relative']:.2f}")

    # Generate and print alerts
    alerts = generate_alerts(results, "Example Feature")
    if alerts:
        print("\nAlerts Generated:")
        for alert in alerts:
            print(f"{alert['level']}: {alert['message']}")

    plt.show()
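In practice you usually monitor many features at once. A small sketch that applies the continuous monitor to every numeric column of two DataFrames and pools the alerts; the reference_data/current_data names are assumed from the evidently snippets above:

# Sketch: batch monitoring across all numeric columns of two DataFrames.
def monitor_all_numeric_features(reference_df, current_df, **monitor_kwargs):
    all_alerts = []
    for col in reference_df.select_dtypes(include="number").columns:
        res = monitor_continuous_distribution(
            current_df[col].dropna(),
            reference_df[col].dropna(),
            feature_name=col,
            **monitor_kwargs,
        )
        all_alerts.extend(generate_alerts(res, col))
        plt.close(res['plot'])  # avoid accumulating open figures
    return all_alerts

# Example:
# alerts = monitor_all_numeric_features(reference_data, current_data, practical_threshold_std=0.3)
# for a in alerts:
#     print(f"{a['level']}: {a['message']}")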