Created: August 28, 2025 11:54
# Accumulating Risk Analysis: "It's there, you just can't see it"
# Demonstrates how risk accumulates over time and the statistical challenges in detecting it

# Load required libraries
library(ggplot2)
library(dplyr)
library(broom)

# Set seed for reproducibility
set.seed(42)

# =============================================================================
# PART 1: Define the Risk Models
# =============================================================================
# Function to generate synthetic data based on the accumulating risk model
# P(incident tomorrow) = 1 - exp(-exp(lambda0 + kappa * a))   (cloglog inverse link)
# where:
#   - a = days since last shock (incident, audit, or training)
#   - lambda0 = risk level right after everyone is paying attention
#   - kappa = how much more likely trouble is tomorrow if we go one more day
generate_accumulating_risk_data <- function(n_periods = 1000,
                                             lambda0 = -4,    # baseline on cloglog scale
                                             kappa = 0.02,    # per-day accumulation on cloglog scale
                                             max_days = 365) {
  data <- data.frame(
    period = seq_len(n_periods),
    days_since_last_event = numeric(n_periods),
    incident = logical(n_periods)
  )
  days_since <- 0
  for (i in seq_len(n_periods)) {
    data$days_since_last_event[i] <- days_since
    eta <- lambda0 + kappa * days_since
    p <- 1 - exp(-exp(eta))  # cloglog inverse link
    incident_occurs <- runif(1) < p
    data$incident[i] <- incident_occurs
    days_since <- if (incident_occurs) 0 else min(days_since + 1, max_days)
  }
  data
}
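
# --- Illustrative check (not part of the original analysis): evaluate the hazard
# --- formula at a few elapsed-day values to make the cloglog curve concrete.
# --- Assumes lambda0 = -4 and kappa = 0.02, matching the defaults above.
example_days <- c(0, 30, 90, 180)
example_eta <- -4 + 0.02 * example_days
round(1 - exp(-exp(example_eta)), 4)  # daily incident probability at 0, 30, 90, 180 days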
# Function to generate constant risk data (null model)
generate_constant_risk_data <- function(n_periods = 1000,
                                        base_prob = 0.05,   # Constant 5% daily risk
                                        max_days = 365) {
  data <- data.frame(
    period = seq_len(n_periods),
    days_since_last_event = numeric(n_periods),
    incident = logical(n_periods)
  )
  days_since <- 0
  for (i in seq_len(n_periods)) {
    # Store current days_since value
    data$days_since_last_event[i] <- days_since
    # Constant probability regardless of days_since
    prob_incident <- base_prob
    # Determine if incident occurs
    incident_occurs <- runif(1) < prob_incident
    data$incident[i] <- incident_occurs
    # Update days_since for next period
    if (incident_occurs) {
      days_since <- 0
    } else {
      days_since <- min(days_since + 1, max_days)
    }
  }
  return(data)
}
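
# --- Illustrative note (an assumption, not in the original script): under a constant
# --- hazard, gaps between incidents follow a geometric distribution, so with
# --- base_prob = 0.05 an incident is expected roughly once every 1 / 0.05 = 20 periods.
expected_gap_periods <- 1 / 0.05  # mean number of periods per incident under the null model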
# =============================================================================
# PART 2: Generate Initial Datasets and Test for Significance
# =============================================================================

cat("=== PART 2: Initial Analysis with Limited Data ===\n")

# Generate datasets
n_initial <- 500  # Relatively small sample
accumulating_data <- generate_accumulating_risk_data(n_initial,
                                                     lambda0 = -4,  # Baseline on cloglog scale
                                                     kappa = 0.01)  # Accumulation rate
constant_data <- generate_constant_risk_data(n_initial,
                                             base_prob = 0.02)      # 2% daily risk

# Fit logistic regression models
model_accumulating <- glm(incident ~ days_since_last_event,
                          data = accumulating_data,
                          family = binomial())
model_constant <- glm(incident ~ days_since_last_event,
                      data = constant_data,
                      family = binomial())

# Display results
cat("\nAccumulating Risk Model Results:\n")
print(summary(model_accumulating))
cat("\nConstant Risk Model Results:\n")
print(summary(model_constant))

# Extract p-values
p_val_accumulating <- summary(model_accumulating)$coefficients[2, 4]
p_val_constant <- summary(model_constant)$coefficients[2, 4]
cat(sprintf("\nP-value for 'days_since_last_event' in accumulating model: %.4f\n", p_val_accumulating))
cat(sprintf("P-value for 'days_since_last_event' in constant model: %.4f\n", p_val_constant))
# =============================================================================
# PART 3: Power Analysis - How Sample Size Affects Detection
# =============================================================================

cat("\n=== PART 3: Power Analysis - The Detection Problem ===\n")

# Function to test significance at different sample sizes
power_analysis <- function(sample_sizes = seq(100, 3000, by = 100),
                           lambda0 = -3.5,
                           kappa = 0.02,
                           alpha = 0.05,
                           n_simulations = 20) {
  results <- data.frame(
    sample_size = numeric(),
    p_value = numeric(),
    significant = logical(),
    coefficient = numeric()
  )
  for (n in sample_sizes) {
    cat(sprintf("Testing sample size: %d\n", n))
    sim_results <- replicate(n_simulations, {
      # Generate data
      data <- generate_accumulating_risk_data(n, lambda0, kappa)
      # Fit model (with error handling)
      tryCatch({
        model <- glm(incident ~ days_since_last_event,
                     data = data,
                     family = binomial())
        # Extract results
        coef_summary <- summary(model)$coefficients
        if (nrow(coef_summary) >= 2) {
          p_val <- coef_summary[2, 4]
          coef_val <- coef_summary[2, 1]
          return(c(p_val, coef_val))
        } else {
          return(c(NA, NA))
        }
      }, error = function(e) {
        return(c(NA, NA))
      })
    }, simplify = TRUE)
    # Calculate average p-value and coefficient (excluding NAs)
    valid_results <- !is.na(sim_results[1, ])
    if (sum(valid_results) > 0) {
      avg_p_val <- mean(sim_results[1, valid_results])
      avg_coef <- mean(sim_results[2, valid_results])
      results <- rbind(results, data.frame(
        sample_size = n,
        p_value = avg_p_val,
        significant = avg_p_val < alpha,
        coefficient = avg_coef
      ))
    }
  }
  return(results)
}

# Run power analysis with cloglog parameters
cat("Running power analysis (this may take a moment)...\n")
power_results <- power_analysis(sample_sizes = seq(500, 5000, by = 500),
                                lambda0 = -4,       # Baseline on cloglog scale
                                kappa = 0.015,      # Meaningful accumulation rate
                                n_simulations = 10)
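
# --- Alternative summary (illustrative, not the original approach): rather than
# --- averaging p-values across simulations, estimate empirical power as the share
# --- of simulations that reach significance at a given sample size. Assumes every
# --- simulated dataset contains enough incidents for the model to estimate a slope.
empirical_power_at <- function(n, lambda0 = -4, kappa = 0.015,
                               alpha = 0.05, n_simulations = 20) {
  hits <- replicate(n_simulations, {
    sim_data <- generate_accumulating_risk_data(n, lambda0, kappa)
    fit <- glm(incident ~ days_since_last_event, data = sim_data, family = binomial())
    summary(fit)$coefficients[2, 4] < alpha
  })
  mean(hits, na.rm = TRUE)
}
cat(sprintf("Estimated empirical power at n = 2000: %.2f\n", empirical_power_at(2000)))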
# Create the key visualization
power_plot <- ggplot(power_results, aes(x = sample_size, y = p_value)) +
  geom_line(color = "darkred", linewidth = 1.2) +
  geom_point(aes(color = significant), size = 3) +
  geom_hline(yintercept = 0.05, linetype = "dashed", color = "blue", alpha = 0.7) +
  scale_color_manual(values = c("TRUE" = "green", "FALSE" = "red"),
                     labels = c("Not Significant", "Significant")) +
  labs(
    title = "The Risk Effect Emerges Only Once the Sample Is Large Enough",
    subtitle = "The accumulating risk signal becomes detectable with sufficient data",
    x = "Number of Observations",
    y = "P-value for Days Since Last Event",
    color = "Statistical\nSignificance",
    caption = "Horizontal line shows the α = 0.05 threshold"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 11, color = "gray40"),
    legend.position = "bottom"
  ) +
  ylim(0, max(power_results$p_value, na.rm = TRUE) * 1.1)

print(power_plot)
# =============================================================================
# PART 4: Visualization of Risk Accumulation
# =============================================================================

cat("\n=== PART 4: Visualizing Risk Accumulation ===\n")

# Create theoretical risk curves with cloglog link
days_sequence <- 0:365                     # Full year
lambda0 <- -4                              # Baseline on cloglog scale
kappa_values <- c(0, 0.005, 0.01, 0.02)    # Accumulation rates

risk_curves <- expand.grid(
  days = days_sequence,
  kappa = kappa_values
) %>%
  mutate(
    eta = lambda0 + kappa * days,
    risk_probability = 1 - exp(-exp(eta)),  # cloglog inverse link
    model_type = ifelse(kappa == 0, "Constant Risk", "Accumulating Risk"),
    kappa_label = paste("κ =", kappa)
  )

risk_plot <- ggplot(risk_curves, aes(x = days, y = risk_probability, color = kappa_label)) +
  geom_line(linewidth = 1.2) +
  labs(
    title = "Theoretical Risk Accumulation Over Time",
    subtitle = "How incident probability changes with days since last event",
    x = "Days Since Last Event",
    y = "Probability of Incident Tomorrow",
    color = "Risk Parameter"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    legend.position = "bottom"
  ) +
  scale_y_continuous(labels = scales::percent_format())

print(risk_plot)
# =============================================================================
# PART 5: Summary and Key Insights
# =============================================================================

cat("\n=== SUMMARY AND KEY INSIGHTS ===\n")

# Find the threshold where significance emerges
significant_threshold <- power_results$sample_size[which(power_results$significant)[1]]
if (is.na(significant_threshold)) {
  significant_threshold <- "Not reached in this analysis"
}
cat(sprintf("Threshold for statistical significance: %s observations\n", significant_threshold))
cat(sprintf("Initial sample size tested: %d observations\n", n_initial))

final_significant <- tail(power_results$significant, 1)
if (!is.na(final_significant) && final_significant) {
  cat("Result: With sufficient data, the accumulating risk effect becomes detectable.\n")
} else {
  cat("Result: Even with larger samples, detection remains challenging.\n")
}

cat("\nKey Takeaways:\n")
cat("1. Rare events make statistical detection difficult\n")
cat("2. Absence of significance ≠ absence of effect\n")
cat("3. The 'it's there, you just can't see it' principle applies\n")
cat("4. Risk may be accumulating even when data suggests otherwise\n")

# Save the workspace for further analysis
cat("\nAnalysis complete. Plots and data objects are ready for use.\n")
# Ultra-compact incident timeline visualization
# Perfect for showing rare binary events with minimal visual clutter
create_compact_timeline <- function(accumulating_data, constant_data) {
  # Set up ultra-flat layout
  par(mfrow = c(2, 1),
      mar = c(1.5, 3, 1.5, 1),   # Minimal margins
      oma = c(2, 0, 2, 0),       # Outer margins for title
      bg = "white")

  # Elegant color palette
  acc_color <- "#E74C3C"    # Red for accumulating risk
  const_color <- "#3498DB"  # Blue for constant risk

  # Plot 1: Accumulating Risk Model
  plot(accumulating_data$period, rep(1, nrow(accumulating_data)),
       type = "n",                 # Don't plot anything yet
       ylim = c(0.5, 1.5),         # Narrow vertical range
       xlim = c(0, max(accumulating_data$period)),
       xlab = "",
       ylab = "",
       main = sprintf("Accumulating Risk (%d incidents)", sum(accumulating_data$incident)),
       axes = FALSE,
       cex.main = 1.1,
       font.main = 2)

  # Add incident markers as clean vertical lines
  incident_periods_acc <- accumulating_data$period[accumulating_data$incident]
  if (length(incident_periods_acc) > 0) {
    segments(incident_periods_acc, rep(0.7, length(incident_periods_acc)),
             incident_periods_acc, rep(1.3, length(incident_periods_acc)),
             col = acc_color, lwd = 2.5, lend = 2)  # Square line ends
  }

  # Minimal x-axis (just tick marks, no labels on top plot)
  axis(1, at = seq(0, max(accumulating_data$period), by = 100),
       labels = FALSE, tck = -0.02, col = "gray60")

  # Plot 2: Constant Risk Model
  plot(constant_data$period, rep(1, nrow(constant_data)),
       type = "n",
       ylim = c(0.5, 1.5),
       xlim = c(0, max(constant_data$period)),
       xlab = "Time Period",
       ylab = "",
       main = sprintf("Constant Risk (%d incidents)", sum(constant_data$incident)),
       axes = FALSE,
       cex.main = 1.1,
       font.main = 2)

  # Add incident markers
  incident_periods_const <- constant_data$period[constant_data$incident]
  if (length(incident_periods_const) > 0) {
    segments(incident_periods_const, rep(0.7, length(incident_periods_const)),
             incident_periods_const, rep(1.3, length(incident_periods_const)),
             col = const_color, lwd = 2.5, lend = 2)
  }

  # Full x-axis with labels on bottom plot
  axis(1, at = seq(0, max(constant_data$period), by = 100),
       labels = seq(0, max(constant_data$period), by = 100),
       cex.axis = 0.9, col = "gray60")

  # Clean overall title
  mtext("When Do Incidents Occur?",
        outer = TRUE, cex = 1.3, font = 2, line = 0.5, col = "gray20")

  # Reset plotting parameters
  par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1, oma = c(0, 0, 0, 0))
}

# Usage:
create_compact_timeline(accumulating_data, constant_data)

cat("Ultra-compact timeline visualization created!\n")
cat("Usage: create_compact_timeline(accumulating_data, constant_data)\n")
cat("\nFeatures:\n")
cat("- Maximally flat layout\n")
cat("- Shows only incident occurrences\n")
cat("- Clean, minimal design\n")
cat("- Perfect for rare events\n")
cat("- No visual clutter\n")