Last active
February 6, 2025 15:28
-
-
Save martinctc/41d52eb8a29c4f5eadedb80b31695e80 to your computer and use it in GitHub Desktop.
Run an appropriate statistical test for any two metrics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' @title Perform a Statistical Test Between Two Variables
#'
#' @description Selects and runs a statistical test for a pair of columns in
#'   a data frame, based on whether each column is numeric or categorical, and
#'   returns the headline results as a one-row data frame.
#'
#' @details
#' The test is chosen as follows (a column is treated as categorical when
#' `is.numeric()` is `FALSE` for it):
#'
#' * Two numeric columns: Pearson correlation, or Spearman correlation when
#'   `non_parametric = TRUE`.
#' * One numeric and one categorical column with exactly two groups:
#'   Student's t-test when an F test of equal variances (`var.test()`) has
#'   p >= 0.05, Welch's t-test otherwise; or the Wilcoxon rank sum test when
#'   `non_parametric = TRUE`.
#' * One numeric and one categorical column with more than two groups:
#'   one-way ANOVA, or the Kruskal-Wallis test when `non_parametric = TRUE`.
#'
#' The numeric and categorical columns may be supplied in either order.
#' Rows where either column is `NA` are dropped before the number of groups
#' is counted and before any test is run; `n` reports the rows that remain.
#'
#' @param data A data frame containing the variables of interest.
#' @param var1 A string or unquoted column name specifying the first variable.
#' @param var2 A string or unquoted column name specifying the second
#'   variable.
#' @param non_parametric A single logical value. If `TRUE`, the rank-based
#'   alternative of each test is used. Defaults to `FALSE`.
#'
#' @return A one-row data frame with the columns:
#'   \describe{
#'     \item{variable1, variable2}{The variable names as supplied.}
#'     \item{test}{Name of the test that was run.}
#'     \item{statistic}{The test statistic (F value for ANOVA).}
#'     \item{p.value}{The p-value of the test.}
#'     \item{significance}{One of `"***"`, `"**"`, `"*"`, `"."` or `" "` for
#'       p-value cutpoints 0.001, 0.01, 0.05 and 0.1.}
#'     \item{estimate}{The correlation coefficient for correlation tests; for
#'       t-tests the mean of the second group minus the mean of the first
#'       (the opposite sign convention to the t statistic); `NA` otherwise.}
#'     \item{n}{Number of complete observations used.}
#'   }
#'
#' @examples
#' # Example usage:
#' # perform_test(mtcars, "cyl", "gear")
#' # perform_test(mtcars, mpg, wt, non_parametric = TRUE)
#' # perform_test(iris, Sepal.Length, Species)
#'
#' @export
perform_test <- function(data, var1, var2, non_parametric = FALSE) {
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame.", call. = FALSE)
  }
  if (!is.logical(non_parametric) || length(non_parametric) != 1L ||
      is.na(non_parametric)) {
    stop("`non_parametric` must be a single TRUE or FALSE.", call. = FALSE)
  }

  # Capture the column arguments so both strings and bare names are accepted
  var1_q <- rlang::enquo(var1)
  var2_q <- rlang::enquo(var2)

  # Variable names as strings, used to label the output
  var1_name <- rlang::quo_name(var1_q)
  var2_name <- rlang::quo_name(var2_q)

  # Extract the actual data vectors
  var1_data <- dplyr::pull(data, !!var1_q)
  var2_data <- dplyr::pull(data, !!var2_q)

  # Determine variable types
  var1_numeric <- is.numeric(var1_data)
  var2_numeric <- is.numeric(var2_data)

  if (!var1_numeric && !var2_numeric) {
    stop("Unsupported variable types. Provide one of the following:\n1. Two numeric variables\n2. One numeric and one categorical variable")
  }

  # Keep complete pairs only. The tests drop incomplete rows anyway; doing it
  # up front keeps the group count and `n` consistent with what is tested.
  complete <- !is.na(var1_data) & !is.na(var2_data)
  var1_data <- var1_data[complete]
  var2_data <- var2_data[complete]
  n <- sum(complete)

  estimate <- NA_real_

  if (var1_numeric && var2_numeric) {
    # Both numeric: correlation test
    if (non_parametric) {
      test_result <- stats::cor.test(var1_data, var2_data, method = "spearman")
      test_name <- "Spearman Correlation"
    } else {
      test_result <- stats::cor.test(var1_data, var2_data, method = "pearson")
      test_name <- "Pearson Correlation"
    }
    statistic <- test_result$statistic
    p_value <- test_result$p.value
    estimate <- test_result$estimate
  } else {
    # One numeric, one categorical: the numeric column is always the response,
    # whichever argument it was passed as.
    if (var1_numeric) {
      values <- var1_data
      groups <- factor(var2_data)
    } else {
      values <- var2_data
      groups <- factor(var1_data)
    }

    # factor() drops unused levels, so this counts groups actually present
    num_levels <- nlevels(groups)
    if (num_levels < 2L) {
      stop(
        "The categorical variable must have at least two groups ",
        "after removing missing values.",
        call. = FALSE
      )
    }

    if (num_levels == 2L) {
      # Two groups: t-test or Wilcoxon
      if (non_parametric) {
        test_result <- stats::wilcox.test(values ~ groups)
        test_name <- "Wilcoxon Rank Sum Test"
      } else {
        # Check for equality of variance (homoscedasticity). Student's t-test
        # is used only when equal variances are not rejected; an undefined
        # p-value falls back to Welch's t-test.
        var_test <- stats::var.test(values ~ groups)
        equal_var <- isTRUE(var_test$p.value >= 0.05)
        test_result <- stats::t.test(values ~ groups, var.equal = equal_var)
        test_name <- if (equal_var) "Student's t-test" else "Welch's t-test"
        # Mean of the second group minus mean of the first
        estimate <- diff(tapply(values, groups, mean))
      }
      statistic <- test_result$statistic
      p_value <- test_result$p.value
    } else if (non_parametric) {
      # More than two groups, rank-based
      test_result <- stats::kruskal.test(values ~ groups)
      test_name <- "Kruskal-Wallis Test"
      statistic <- test_result$statistic
      p_value <- test_result$p.value
    } else {
      # More than two groups: one-way ANOVA. An aov object carries no
      # `statistic` or `p.value` element, so read them from the summary
      # table, whose first row is the grouping term.
      test_result <- stats::aov(values ~ groups)
      test_name <- "ANOVA"
      anova_table <- summary(test_result)[[1]]
      statistic <- anova_table[["F value"]][1]
      p_value <- anova_table[["Pr(>F)"]][1]
    }
  }

  # Determine significance indicators
  sig_indicators <- stats::symnum(
    p_value,
    corr = FALSE,
    cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
    symbols = c("***", "**", "*", ".", " ")
  )

  # Create result data frame. Names are stripped from the statistic and the
  # estimate so they do not leak into the row names.
  data.frame(
    variable1 = var1_name,
    variable2 = var2_name,
    test = test_name,
    statistic = unname(statistic),
    p.value = p_value,
    significance = as.character(sig_indicators),
    estimate = unname(estimate),
    n = n,
    row.names = NULL
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment