imadmali · December 8, 2018 03:54
diff --git a/welch_t_test.R b/welch_t_test.R
 # data
 x1 <- rnorm(10, 1.1, 3)
 x2 <- rnorm(50, 0.7, 3)
 v_x1 <- var(x1)
 v_x2 <- var(x2)
 n_x1 <- length(x1)
 n_x2 <- length(x2)

 # welch's t-stat
 num <- mean(x1) - mean(x2)
 denom <- sqrt(v_x1/n_x1 + v_x2/n_x2) 
 t_stat <- num/denom

 # degrees of freedom
 num <- denom^4
 denom1 <- ((v_x1/n_x1)^2)/(n_x1-1)
 denom2 <- ((v_x2/n_x2)^2)/(n_x2-1)
 deg_free <- num / (denom1 + denom2)

 # replicate the data generation process under the null hypothesis
 # and compute the test statistic
 t_stat_samples <- c()
 for (i in 1:1e4) {
  y1 <- rnorm(1e3, mean(x1), 3)
  y2 <- rnorm(500, mean(x1), 3)
  num <- mean(y1) - mean(y2)
  denom <- sqrt((var(y1)/length(y1)) + (var(y2)/length(y2)))
  t_stat_samples[i] <- num/denom
 }

 # empirical p-value
 p_val_emp <- length(which(t_stat_samples >= abs(t_stat))) +
  length(which(t_stat_samples <= -abs(t_stat)))
 p_val_emp <- p_val_emp/1e4

 # true p_value
 p_val <- pt(-abs(t_stat), deg_free) * 2

 # check
 t.test(x1, x2)

 # visualize
 hist(t_stat_samples, breaks = 50, col = "#808080", border = FALSE, freq = FALSE,
     main = "Distribution of t-statistic under null",
     xlab = "t-statistic samples")
 crit_val <- qt(1-0.05, deg_free)
 lines(seq(-5,5,length.out = 1e3),
      dt(seq(-5,5,length.out = 1e3), deg_free), col = "#7DBAF5", lwd = 3)
 abline(v = crit_val, lwd = 1, lty = 2)
 abline(v = -crit_val, lwd = 1, lty = 2)
 abline(v = abs(t_stat), col = "#FF6688", lwd = 3)
 abline(v = -abs(t_stat), col = "#FF6688", lwd = 3)
 legend("topright", c("t-stat", "crit value", "t-stat dist"),
       lty = c(1,2,1), lwd = c(2,2,2), col = c("#FF6688","#000000","#7DBAF5"))
	# data
	x1 <- rnorm(10, 1.1, 3)
	x2 <- rnorm(50, 0.7, 3)
	v_x1 <- var(x1)
	v_x2 <- var(x2)
	n_x1 <- length(x1)
	n_x2 <- length(x2)

	# welch's t-stat
	num <- mean(x1) - mean(x2)
	denom <- sqrt(v_x1/n_x1 + v_x2/n_x2)
	t_stat <- num/denom

	# degrees of freedom
	num <- denom^4
	denom1 <- ((v_x1/n_x1)^2)/(n_x1-1)
	denom2 <- ((v_x2/n_x2)^2)/(n_x2-1)
	deg_free <- num / (denom1 + denom2)

	# replicate the data generation process under the null hypothesis
	# and compute the test statistic
	t_stat_samples <- c()
	for (i in 1:1e4) {
	y1 <- rnorm(1e3, mean(x1), 3)
	y2 <- rnorm(500, mean(x1), 3)
	num <- mean(y1) - mean(y2)
	denom <- sqrt((var(y1)/length(y1)) + (var(y2)/length(y2)))
	t_stat_samples[i] <- num/denom
	}

	# empirical p-value
	p_val_emp <- length(which(t_stat_samples >= abs(t_stat))) +
	length(which(t_stat_samples <= -abs(t_stat)))
	p_val_emp <- p_val_emp/1e4

	# true p_value
	p_val <- pt(-abs(t_stat), deg_free) * 2

	# check
	t.test(x1, x2)

	# visualize
	hist(t_stat_samples, breaks = 50, col = "#808080", border = FALSE, freq = FALSE,
	main = "Distribution of t-statistic under null",
	xlab = "t-statistic samples")
	crit_val <- qt(1-0.05, deg_free)
	lines(seq(-5,5,length.out = 1e3),
	dt(seq(-5,5,length.out = 1e3), deg_free), col = "#7DBAF5", lwd = 3)
	abline(v = crit_val, lwd = 1, lty = 2)
	abline(v = -crit_val, lwd = 1, lty = 2)
	abline(v = abs(t_stat), col = "#FF6688", lwd = 3)
	abline(v = -abs(t_stat), col = "#FF6688", lwd = 3)
	legend("topright", c("t-stat", "crit value", "t-stat dist"),
	lty = c(1,2,1), lwd = c(2,2,2), col = c("#FF6688","#000000","#7DBAF5"))
No results found