Skip to content

Instantly share code, notes, and snippets.

@martinctc
Last active February 6, 2025 15:28
Show Gist options
  • Select an option

  • Save martinctc/41d52eb8a29c4f5eadedb80b31695e80 to your computer and use it in GitHub Desktop.

Select an option

Save martinctc/41d52eb8a29c4f5eadedb80b31695e80 to your computer and use it in GitHub Desktop.
run any statistical tests for two metrics
#' @title Perform a Statistical Test
#'
#' @description Picks and runs a statistical test for a pair of columns in a
#'   data frame, based on the column types, and returns a one-row summary.
#'
#' @details
#' Rows where either variable is missing are dropped before testing, so `n`
#' is the number of pairwise-complete observations. The test is chosen as
#' follows:
#'
#' * both numeric: Pearson correlation, or Spearman if `non_parametric`;
#' * one numeric, one categorical with two groups: Student's t-test (Welch's
#'   t-test when an F test of equal variances has p < 0.05), or the Wilcoxon
#'   rank sum test if `non_parametric`;
#' * one numeric, one categorical with more than two groups: one-way ANOVA,
#'   or the Kruskal-Wallis test if `non_parametric`.
#'
#' The numeric and the categorical variable may be given in either order.
#' Any non-numeric column (character, factor, logical, ...) is treated as
#' categorical.
#'
#' @param data A data frame containing the variables of interest.
#' @param var1 A string or bare column name specifying the first variable.
#' @param var2 A string or bare column name specifying the second variable.
#' @param non_parametric If `TRUE`, use the rank-based alternative of the
#'   selected test. Defaults to `FALSE`.
#'
#' @return A one-row data frame with columns `variable1`, `variable2`,
#'   `test`, `statistic`, `p.value`, `significance` (star notation),
#'   `estimate` (the correlation coefficient, or the difference in group
#'   means "second group minus first group" for t-tests; `NA` otherwise)
#'   and `n`.
#'
#' @examples
#' # Example usage:
#' # perform_test(mtcars, "cyl", "gear")
#' # perform_test(iris, Sepal.Length, Species)
#'
#' @export
perform_test <- function(data, var1, var2, non_parametric = FALSE) {
  # Capture the column arguments so that both bare names and strings work
  var1_q <- rlang::enquo(var1)
  var2_q <- rlang::enquo(var2)
  var1_name <- rlang::quo_name(var1_q)
  var2_name <- rlang::quo_name(var2_q)
  var1_data <- dplyr::pull(data, !!var1_q)
  var2_data <- dplyr::pull(data, !!var2_q)

  # Every test below silently drops incomplete pairs. Drop them up front so
  # that the group count, the group means and `n` describe the data that is
  # actually tested (an NA must not count as a group level).
  complete <- !is.na(var1_data) & !is.na(var2_data)
  var1_data <- var1_data[complete]
  var2_data <- var2_data[complete]
  n <- length(var1_data)

  var1_numeric <- is.numeric(var1_data)
  var2_numeric <- is.numeric(var2_data)

  if (var1_numeric && var2_numeric) {
    # Both numeric: correlation test
    if (non_parametric) {
      test_name <- "Spearman Correlation"
      test_result <- stats::cor.test(var1_data, var2_data, method = "spearman")
    } else {
      test_name <- "Pearson Correlation"
      test_result <- stats::cor.test(var1_data, var2_data, method = "pearson")
    }
    statistic <- test_result$statistic
    p_value <- test_result$p.value
    estimate <- test_result$estimate
  } else if (var1_numeric || var2_numeric) {
    # One numeric, one categorical: accept them in either order
    if (var1_numeric) {
      values <- var1_data
      groups <- var2_data
    } else {
      values <- var2_data
      groups <- var1_data
    }
    # factor() also drops unused levels, which would otherwise show up as
    # empty groups
    groups <- factor(groups)
    num_levels <- nlevels(groups)
    if (num_levels < 2L) {
      stop(
        "The categorical variable needs at least two groups with ",
        "non-missing data.",
        call. = FALSE
      )
    }

    estimate <- NA_real_
    if (num_levels == 2L) {
      if (non_parametric) {
        test_name <- "Wilcoxon Rank Sum Test"
        test_result <- stats::wilcox.test(values ~ groups)
      } else {
        # F test for equal variances decides between Student and Welch;
        # fall back to Welch when the F test is indeterminate (NA p-value)
        var_test <- stats::var.test(values ~ groups)
        equal_var <- isTRUE(var_test$p.value >= 0.05)
        test_name <- if (equal_var) "Student's t-test" else "Welch's t-test"
        test_result <- stats::t.test(values ~ groups, var.equal = equal_var)
        group_means <- vapply(split(values, groups), mean, numeric(1))
        estimate <- group_means[[2L]] - group_means[[1L]]
      }
      statistic <- test_result$statistic
      p_value <- test_result$p.value
    } else if (non_parametric) {
      test_name <- "Kruskal-Wallis Test"
      test_result <- stats::kruskal.test(values ~ groups)
      statistic <- test_result$statistic
      p_value <- test_result$p.value
    } else {
      test_name <- "ANOVA"
      test_result <- stats::aov(values ~ groups)
      # An `aov` fit has no `statistic` / `p.value` elements; the F test
      # lives in the first row (the grouping term) of its summary table
      anova_table <- summary(test_result)[[1L]]
      statistic <- anova_table[["F value"]][1L]
      p_value <- anova_table[["Pr(>F)"]][1L]
    }
  } else {
    stop(
      "Unsupported variable types. Provide one of the following:\n1. Two numeric variables\n2. One numeric and one categorical variable",
      call. = FALSE
    )
  }

  # Star notation for the p-value
  sig_indicators <- stats::symnum(
    p_value,
    cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
    symbols = c("***", "**", "*", ".", " ")
  )

  # unname() keeps the statistic's label (e.g. "t") out of the row names
  data.frame(
    variable1 = var1_name,
    variable2 = var2_name,
    test = test_name,
    statistic = unname(statistic),
    p.value = p_value,
    significance = as.character(sig_indicators),
    estimate = unname(estimate),
    n = n
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment