Skip to content

Instantly share code, notes, and snippets.

@FrankRuns
Created August 28, 2025 11:54
Show Gist options
  • Select an option

  • Save FrankRuns/6de066df7a760dd89ae2130f3b6885ad to your computer and use it in GitHub Desktop.

Select an option

Save FrankRuns/6de066df7a760dd89ae2130f3b6885ad to your computer and use it in GitHub Desktop.
# Accumulating Risk Analysis: "It's there, you just can't see it"
# Demonstrates how risk accumulates over time and the statistical challenges in detecting it
# Load required libraries
library(ggplot2)
library(dplyr)
library(broom)
# Set seed for reproducibility
set.seed(42)
# =============================================================================
# PART 1: Define the Risk Models
# =============================================================================
# Function to generate synthetic data based on accumulating risk model
# P(incident tomorrow) = 1 - exp(lambda0 + kappa * a)
# where:
# - a = days since last shock (incident, audit, or training)
# - lambda0 = risk level right after everyone is paying attention
# - kappa = how much more likely trouble is tomorrow if we go one more day
generate_accumulating_risk_data <- function(n_periods = 1000,
lambda0 = -4, # baseline on cloglog scale
kappa = 0.02, # per-day accumulation on cloglog scale
max_days = 365) {
data <- data.frame(
period = seq_len(n_periods),
days_since_last_event = numeric(n_periods),
incident = logical(n_periods)
)
days_since <- 0
for (i in seq_len(n_periods)) {
data$days_since_last_event[i] <- days_since
eta <- lambda0 + kappa * days_since
p <- 1 - exp(-exp(eta)) # cloglog inverse link
incident_occurs <- runif(1) < p
data$incident[i] <- incident_occurs
days_since <- if (incident_occurs) 0 else min(days_since + 1, max_days)
}
data
}
# Function to generate constant risk data (null model)
generate_constant_risk_data <- function(n_periods = 1000,
base_prob = 0.05, # Constant 5% daily risk
max_days = 365) {
data <- data.frame(
period = 1:n_periods,
days_since_last_event = numeric(n_periods),
incident = logical(n_periods)
)
days_since <- 0
for (i in 1:n_periods) {
# Store current days_since value
data$days_since_last_event[i] <- days_since
# Constant probability regardless of days_since
prob_incident <- base_prob
# Determine if incident occurs
incident_occurs <- runif(1) < prob_incident
data$incident[i] <- incident_occurs
# Update days_since for next period
if (incident_occurs) {
days_since <- 0
} else {
days_since <- min(days_since + 1, max_days)
}
}
return(data)
}
# =============================================================================
# PART 2: Generate Initial Datasets and Test for Significance
# =============================================================================
cat("=== PART 2: Initial Analysis with Limited Data ===\n")
# Generate datasets
n_initial <- 500 # Relatively small sample
accumulating_data <- generate_accumulating_risk_data(n_initial,
lambda0 = -4, # Baseline on cloglog scale
kappa = 0.01) # Accumulation rate
constant_data <- generate_constant_risk_data(n_initial,
base_prob = 0.02) # 2% daily risk
# Fit logistic regression models
model_accumulating <- glm(incident ~ days_since_last_event,
data = accumulating_data,
family = binomial())
model_constant <- glm(incident ~ days_since_last_event,
data = constant_data,
family = binomial())
# Display results
cat("\nAccumulating Risk Model Results:\n")
print(summary(model_accumulating))
cat("\nConstant Risk Model Results:\n")
print(summary(model_constant))
# Extract p-values
p_val_accumulating <- summary(model_accumulating)$coefficients[2, 4]
p_val_constant <- summary(model_constant)$coefficients[2, 4]
cat(sprintf("\nP-value for 'days_since_last_event' in accumulating model: %.4f\n", p_val_accumulating))
cat(sprintf("P-value for 'days_since_last_event' in constant model: %.4f\n", p_val_constant))
# =============================================================================
# PART 3: Power Analysis - How Sample Size Affects Detection
# =============================================================================
cat("\n=== PART 3: Power Analysis - The Detection Problem ===\n")
# Function to test significance at different sample sizes
power_analysis <- function(sample_sizes = seq(100, 3000, by=100),
lambda0 = -3.5,
kappa = 0.02,
alpha = 0.05,
n_simulations = 20) {
results <- data.frame(
sample_size = numeric(),
p_value = numeric(),
significant = logical(),
coefficient = numeric()
)
for (n in sample_sizes) {
cat(sprintf("Testing sample size: %d\n", n))
sim_results <- replicate(n_simulations, {
# Generate data
data <- generate_accumulating_risk_data(n, lambda0, kappa)
# Fit model (with error handling)
tryCatch({
model <- glm(incident ~ days_since_last_event,
data = data,
family = binomial())
# Extract results
coef_summary <- summary(model)$coefficients
if (nrow(coef_summary) >= 2) {
p_val <- coef_summary[2, 4]
coef_val <- coef_summary[2, 1]
return(c(p_val, coef_val))
} else {
return(c(NA, NA))
}
}, error = function(e) {
return(c(NA, NA))
})
}, simplify = TRUE)
# Calculate average p-value and coefficient (excluding NAs)
valid_results <- !is.na(sim_results[1, ])
if (sum(valid_results) > 0) {
avg_p_val <- mean(sim_results[1, valid_results])
avg_coef <- mean(sim_results[2, valid_results])
results <- rbind(results, data.frame(
sample_size = n,
p_value = avg_p_val,
significant = avg_p_val < alpha,
coefficient = avg_coef
))
}
}
return(results)
}
# Run power analysis with cloglog parameters
cat("Running power analysis (this may take a moment)...\n")
power_results <- power_analysis(sample_sizes = seq(500, 5000, by=500),
lambda0 = -4, # Baseline on cloglog scale
kappa = 0.015, # Meaningful accumulation rate
n_simulations = 10)
# Create the key visualization
power_plot <- ggplot(power_results, aes(x = sample_size, y = p_value)) +
geom_line(color = "darkred", size = 1.2) +
geom_point(aes(color = significant), size = 3) +
geom_hline(yintercept = 0.05, linetype = "dashed", color = "blue", alpha = 0.7) +
scale_color_manual(values = c("TRUE" = "green", "FALSE" = "red"),
labels = c("Not Significant", "Significant")) +
labs(
title = "Risk Effect Emerges Only Once Sample Size > N",
subtitle = "The accumulating risk signal becomes detectable with sufficient data",
x = "Number of Observations",
y = "P-value for Days Since Last Event",
color = "Statistical\nSignificance",
caption = "Horizontal line shows α = 0.05 threshold"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 11, color = "gray40"),
legend.position = "bottom"
) +
ylim(0, max(power_results$p_value, na.rm = TRUE) * 1.1)
print(power_plot)
# =============================================================================
# PART 4: Visualization of Risk Accumulation
# =============================================================================
cat("\n=== PART 4: Visualizing Risk Accumulation ===\n")
# Create theoretical risk curves with cloglog link
days_sequence <- 0:365 # Full year
lambda0 <- -4 # Baseline on cloglog scale
kappa_values <- c(0, 0.005, 0.01, 0.02) # Accumulation rates
risk_curves <- expand.grid(
days = days_sequence,
kappa = kappa_values
) %>%
mutate(
eta = lambda0 + kappa * days,
risk_probability = 1 - exp(-exp(eta)), # cloglog inverse link
model_type = ifelse(kappa == 0, "Constant Risk", "Accumulating Risk"),
kappa_label = paste("κ =", kappa)
)
risk_plot <- ggplot(risk_curves, aes(x = days, y = risk_probability, color = kappa_label)) +
geom_line(size = 1.2) +
labs(
title = "Theoretical Risk Accumulation Over Time",
subtitle = "How incident probability changes with days since last event",
x = "Days Since Last Event",
y = "Probability of Incident Tomorrow",
color = "Risk Parameter"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
legend.position = "bottom"
) +
scale_y_continuous(labels = scales::percent_format())
print(risk_plot)
# =============================================================================
# PART 5: Summary and Key Insights
# =============================================================================
cat("\n=== SUMMARY AND KEY INSIGHTS ===\n")
# Find the threshold where significance emerges
significant_threshold <- power_results$sample_size[which(power_results$significant)[1]]
if (is.na(significant_threshold)) {
significant_threshold <- "Not reached in this analysis"
}
cat(sprintf("Threshold for statistical significance: %s observations\n", significant_threshold))
cat(sprintf("Initial sample size tested: %d observations\n", n_initial))
final_significant <- tail(power_results$significant, 1)
if (!is.na(final_significant) && final_significant) {
cat("Result: With sufficient data, the accumulating risk effect becomes detectable.\n")
} else {
cat("Result: Even with larger samples, detection remains challenging.\n")
}
cat("\nKey Takeaways:\n")
cat("1. Rare events make statistical detection difficult\n")
cat("2. Absence of significance ≠ absence of effect\n")
cat("3. The 'it's there, you just can't see it' principle applies\n")
cat("4. Risk may be accumulating even when data suggests otherwise\n")
# Save the workspace for further analysis
cat("\nAnalysis complete. Plots and data objects are ready for use.\n")
# Ultra-compact incident timeline visualization
# Perfect for showing rare binary events with minimal visual clutter
create_compact_timeline <- function(accumulating_data, constant_data) {
# Set up ultra-flat layout
par(mfrow = c(2, 1),
mar = c(1.5, 3, 1.5, 1), # Minimal margins
oma = c(2, 0, 2, 0), # Outer margins for title
bg = "white")
# Elegant color palette
acc_color <- "#E74C3C" # Red for accumulating risk
const_color <- "#3498DB" # Blue for constant risk
# Plot 1: Accumulating Risk Model
plot(accumulating_data$period, rep(1, nrow(accumulating_data)),
type = "n", # Don't plot anything yet
ylim = c(0.5, 1.5), # Narrow vertical range
xlim = c(0, max(accumulating_data$period)),
xlab = "",
ylab = "",
main = sprintf("Accumulating Risk (%d incidents)", sum(accumulating_data$incident)),
axes = FALSE,
cex.main = 1.1,
font.main = 2)
# Add incident markers as clean vertical lines
incident_periods_acc <- accumulating_data$period[accumulating_data$incident]
if(length(incident_periods_acc) > 0) {
segments(incident_periods_acc, rep(0.7, length(incident_periods_acc)),
incident_periods_acc, rep(1.3, length(incident_periods_acc)),
col = acc_color, lwd = 2.5, lend = 2) # Square line ends
}
# Minimal x-axis (just tick marks, no labels on top plot)
axis(1, at = seq(0, max(accumulating_data$period), by = 100),
labels = FALSE, tck = -0.02, col = "gray60")
# Plot 2: Constant Risk Model
plot(constant_data$period, rep(1, nrow(constant_data)),
type = "n",
ylim = c(0.5, 1.5),
xlim = c(0, max(constant_data$period)),
xlab = "Time Period",
ylab = "",
main = sprintf("Constant Risk (%d incidents)", sum(constant_data$incident)),
axes = FALSE,
cex.main = 1.1,
font.main = 2)
# Add incident markers
incident_periods_const <- constant_data$period[constant_data$incident]
if(length(incident_periods_const) > 0) {
segments(incident_periods_const, rep(0.7, length(incident_periods_const)),
incident_periods_const, rep(1.3, length(incident_periods_const)),
col = const_color, lwd = 2.5, lend = 2)
}
# Full x-axis with labels on bottom plot
axis(1, at = seq(0, max(constant_data$period), by = 100),
labels = seq(0, max(constant_data$period), by = 100),
cex.axis = 0.9, col = "gray60")
# Clean overall title
mtext("When Do Incidents Occur?",
outer = TRUE, cex = 1.3, font = 2, line = 0.5, col = "gray20")
# Reset plotting parameters
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1, oma = c(0, 0, 0, 0))
}
# Usage:
create_compact_timeline(accumulating_data, constant_data)
cat("Ultra-compact timeline visualization created!\n")
cat("Usage: create_compact_timeline(accumulating_data, constant_data)\n")
cat("\nFeatures:\n")
cat("- Maximally flat layout\n")
cat("- Shows only incident occurrences\n")
cat("- Clean, minimal design\n")
cat("- Perfect for rare events\n")
cat("- No visual clutter\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment