martinctc · February 6, 2025 15:28
diff --git a/run-stats-tests.R b/run-stats-tests.R
 #' @title Perform a Statistical Test Between Two Variables
 #'
 #' @description Picks and runs a bivariate statistical test based on the types
 #'   of two columns in a data frame, and returns the key results as a one-row
 #'   data frame.
 #'
 #' @details The test is selected as follows:
 #' \itemize{
 #'   \item Both variables numeric: Pearson correlation, or Spearman
 #'     correlation when \code{non_parametric = TRUE}.
 #'   \item \code{var1} numeric and \code{var2} categorical with two levels:
 #'     Student's t-test, or Welch's t-test when an F test of equal variances
 #'     (\code{var.test()}) has p < 0.05; Wilcoxon rank sum test when
 #'     \code{non_parametric = TRUE}.
 #'   \item \code{var1} numeric and \code{var2} categorical with more than two
 #'     levels: one-way ANOVA, or Kruskal-Wallis test when
 #'     \code{non_parametric = TRUE}.
 #' }
 #' Missing values in \code{var2} are not counted as a level. Any other
 #' combination of variable types (including a categorical \code{var1}) raises
 #' an error.
 #'
 #' @param data A data frame containing the variables of interest.
 #' @param var1 A string or symbol specifying the first variable. Must refer to
 #'   a numeric column.
 #' @param var2 A string or symbol specifying the second variable, either
 #'   numeric or categorical.
 #' @param non_parametric Logical. If \code{TRUE}, the rank-based counterpart
 #'   of the test is used. Defaults to \code{FALSE}.
 #'
 #' @return A one-row data frame with the columns:
 #' \itemize{
 #'   \item \code{variable1}, \code{variable2}: the variable names.
 #'   \item \code{test}: the name of the test that was run.
 #'   \item \code{statistic}: the test statistic.
 #'   \item \code{p.value}: the p-value of the test.
 #'   \item \code{significance}: \code{"***"} (p <= 0.001), \code{"**"}
 #'     (p <= 0.01), \code{"*"} (p <= 0.05), \code{"."} (p <= 0.1) or
 #'     \code{" "} otherwise.
 #'   \item \code{estimate}: the correlation coefficient for correlation tests;
 #'     the mean of the second group minus the mean of the first group for
 #'     t-tests (note that this is opposite in sign to the t statistic);
 #'     \code{NA} for all other tests.
 #'   \item \code{n}: the number of values in \code{var1}, including missing
 #'     values.
 #' }
 #'
 #' @examples
 #' perform_test(mtcars, mpg, wt)
 #' perform_test(iris, Sepal.Length, Species)
 #'
 #' @export
 perform_test <- function(data, var1, var2, non_parametric = FALSE) {
   # Capture the column references so that bare names and strings both work
   var1_q <- rlang::enquo(var1)
   var2_q <- rlang::enquo(var2)

   var1_name <- rlang::quo_name(var1_q)
   var2_name <- rlang::quo_name(var2_q)

   var1_data <- dplyr::pull(data, !!var1_q)
   var2_data <- dplyr::pull(data, !!var2_q)

   # Numeric NA keeps the `estimate` column type stable across all tests
   estimate <- NA_real_

   if (is.numeric(var1_data) && is.numeric(var2_data)) {
     # Both numeric: correlation test
     if (non_parametric) {
       test_result <- stats::cor.test(var1_data, var2_data, method = "spearman")
       test_name <- "Spearman Correlation"
     } else {
       test_result <- stats::cor.test(var1_data, var2_data, method = "pearson")
       test_name <- "Pearson Correlation"
     }
     statistic <- test_result$statistic
     p_value <- test_result$p.value
     estimate <- test_result$estimate
   } else if (is.numeric(var1_data)) {
     # Numeric vs. categorical. factor() drops NA and unused levels, so only
     # groups that are actually observed are counted.
     groups <- factor(var2_data)
     num_levels <- nlevels(groups)

     if (num_levels < 2L) {
       stop(
         "The categorical variable must have at least two non-missing levels.",
         call. = FALSE
       )
     }

     if (num_levels == 2L) {
       if (non_parametric) {
         test_result <- stats::wilcox.test(var1_data ~ groups)
         test_name <- "Wilcoxon Rank Sum Test"
       } else {
         # A significant F test means unequal variances: use Welch's t-test
         var_test <- stats::var.test(var1_data ~ groups)
         unequal_var <- var_test$p.value < 0.05
         test_result <- stats::t.test(
           var1_data ~ groups,
           var.equal = !unequal_var
         )
         test_name <- if (unequal_var) "Welch's t-test" else "Student's t-test"
         # Second group mean minus first group mean
         estimate <- diff(tapply(var1_data, groups, mean, na.rm = TRUE))
       }
       statistic <- test_result$statistic
       p_value <- test_result$p.value
     } else if (non_parametric) {
       test_result <- stats::kruskal.test(var1_data ~ groups)
       test_name <- "Kruskal-Wallis Test"
       statistic <- test_result$statistic
       p_value <- test_result$p.value
     } else {
       # An aov fit carries no statistic/p.value elements; they live in the
       # first row (the grouping term) of the summary table.
       anova_table <- summary(stats::aov(var1_data ~ groups))[[1L]]
       test_name <- "ANOVA"
       statistic <- anova_table[["F value"]][1L]
       p_value <- anova_table[["Pr(>F)"]][1L]
     }
   } else {
     stop("Unsupported variable types. Provide one of the following:\n1. Two numeric variables\n2. One numeric and one categorical variable")
   }

   # Conventional significance stars for the p-value
   sig_indicators <- stats::symnum(
     p_value,
     corr = FALSE,
     cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
     symbols = c("***", "**", "*", ".", " ")
   )

   # unname() keeps test-specific labels (e.g. "t", "W") out of the row names
   data.frame(
     variable1 = var1_name,
     variable2 = var2_name,
     test = test_name,
     statistic = unname(statistic),
     p.value = p_value,
     significance = as.character(sig_indicators),
     estimate = unname(estimate),
     n = length(var1_data)
   )
 }
	#' @title Perform a Statistical Test Between Two Variables
	#'
	#' @description Picks and runs a bivariate statistical test based on the types
	#'   of two columns in a data frame, and returns the key results as a one-row
	#'   data frame.
	#'
	#' @details The test is selected as follows:
	#' \itemize{
	#'   \item Both variables numeric: Pearson correlation, or Spearman
	#'     correlation when \code{non_parametric = TRUE}.
	#'   \item \code{var1} numeric and \code{var2} categorical with two levels:
	#'     Student's t-test, or Welch's t-test when an F test of equal variances
	#'     (\code{var.test()}) has p < 0.05; Wilcoxon rank sum test when
	#'     \code{non_parametric = TRUE}.
	#'   \item \code{var1} numeric and \code{var2} categorical with more than two
	#'     levels: one-way ANOVA, or Kruskal-Wallis test when
	#'     \code{non_parametric = TRUE}.
	#' }
	#' Missing values in \code{var2} are not counted as a level. Any other
	#' combination of variable types (including a categorical \code{var1}) raises
	#' an error.
	#'
	#' @param data A data frame containing the variables of interest.
	#' @param var1 A string or symbol specifying the first variable. Must refer to
	#'   a numeric column.
	#' @param var2 A string or symbol specifying the second variable, either
	#'   numeric or categorical.
	#' @param non_parametric Logical. If \code{TRUE}, the rank-based counterpart
	#'   of the test is used. Defaults to \code{FALSE}.
	#'
	#' @return A one-row data frame with the columns:
	#' \itemize{
	#'   \item \code{variable1}, \code{variable2}: the variable names.
	#'   \item \code{test}: the name of the test that was run.
	#'   \item \code{statistic}: the test statistic.
	#'   \item \code{p.value}: the p-value of the test.
	#'   \item \code{significance}: \code{"***"} (p <= 0.001), \code{"**"}
	#'     (p <= 0.01), \code{"*"} (p <= 0.05), \code{"."} (p <= 0.1) or
	#'     \code{" "} otherwise.
	#'   \item \code{estimate}: the correlation coefficient for correlation tests;
	#'     the mean of the second group minus the mean of the first group for
	#'     t-tests (note that this is opposite in sign to the t statistic);
	#'     \code{NA} for all other tests.
	#'   \item \code{n}: the number of values in \code{var1}, including missing
	#'     values.
	#' }
	#'
	#' @examples
	#' perform_test(mtcars, mpg, wt)
	#' perform_test(iris, Sepal.Length, Species)
	#'
	#' @export
	perform_test <- function(data, var1, var2, non_parametric = FALSE) {
	  # Capture the column references so that bare names and strings both work
	  var1_q <- rlang::enquo(var1)
	  var2_q <- rlang::enquo(var2)

	  var1_name <- rlang::quo_name(var1_q)
	  var2_name <- rlang::quo_name(var2_q)

	  var1_data <- dplyr::pull(data, !!var1_q)
	  var2_data <- dplyr::pull(data, !!var2_q)

	  # Numeric NA keeps the `estimate` column type stable across all tests
	  estimate <- NA_real_

	  if (is.numeric(var1_data) && is.numeric(var2_data)) {
	    # Both numeric: correlation test
	    if (non_parametric) {
	      test_result <- stats::cor.test(var1_data, var2_data, method = "spearman")
	      test_name <- "Spearman Correlation"
	    } else {
	      test_result <- stats::cor.test(var1_data, var2_data, method = "pearson")
	      test_name <- "Pearson Correlation"
	    }
	    statistic <- test_result$statistic
	    p_value <- test_result$p.value
	    estimate <- test_result$estimate
	  } else if (is.numeric(var1_data)) {
	    # Numeric vs. categorical. factor() drops NA and unused levels, so only
	    # groups that are actually observed are counted.
	    groups <- factor(var2_data)
	    num_levels <- nlevels(groups)

	    if (num_levels < 2L) {
	      stop(
	        "The categorical variable must have at least two non-missing levels.",
	        call. = FALSE
	      )
	    }

	    if (num_levels == 2L) {
	      if (non_parametric) {
	        test_result <- stats::wilcox.test(var1_data ~ groups)
	        test_name <- "Wilcoxon Rank Sum Test"
	      } else {
	        # A significant F test means unequal variances: use Welch's t-test
	        var_test <- stats::var.test(var1_data ~ groups)
	        unequal_var <- var_test$p.value < 0.05
	        test_result <- stats::t.test(
	          var1_data ~ groups,
	          var.equal = !unequal_var
	        )
	        test_name <- if (unequal_var) "Welch's t-test" else "Student's t-test"
	        # Second group mean minus first group mean
	        estimate <- diff(tapply(var1_data, groups, mean, na.rm = TRUE))
	      }
	      statistic <- test_result$statistic
	      p_value <- test_result$p.value
	    } else if (non_parametric) {
	      test_result <- stats::kruskal.test(var1_data ~ groups)
	      test_name <- "Kruskal-Wallis Test"
	      statistic <- test_result$statistic
	      p_value <- test_result$p.value
	    } else {
	      # An aov fit carries no statistic/p.value elements; they live in the
	      # first row (the grouping term) of the summary table.
	      anova_table <- summary(stats::aov(var1_data ~ groups))[[1L]]
	      test_name <- "ANOVA"
	      statistic <- anova_table[["F value"]][1L]
	      p_value <- anova_table[["Pr(>F)"]][1L]
	    }
	  } else {
	    stop("Unsupported variable types. Provide one of the following:\n1. Two numeric variables\n2. One numeric and one categorical variable")
	  }

	  # Conventional significance stars for the p-value
	  sig_indicators <- stats::symnum(
	    p_value,
	    corr = FALSE,
	    cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
	    symbols = c("***", "**", "*", ".", " ")
	  )

	  # unname() keeps test-specific labels (e.g. "t", "W") out of the row names
	  data.frame(
	    variable1 = var1_name,
	    variable2 = var2_name,
	    test = test_name,
	    statistic = unname(statistic),
	    p.value = p_value,
	    significance = as.character(sig_indicators),
	    estimate = unname(estimate),
	    n = length(var1_data)
	  )
	}
No results found