@owstron
Created February 18, 2018 06:02
Analysis of the Lalonde data set using three different models (CS112 "Lalonde 3 Ways" assignment).
library(Matching)      # provides the lalonde data set
library(boot)          # cv.glm() and bootstrap utilities
library(arm)           # sim() for simulating model coefficients
library(randomForest)  # random forests
data(lalonde)
# Rescale real earnings to thousands of dollars.
lalonde$re74 = lalonde$re74 / 1000
lalonde$re75 = lalonde$re75 / 1000
lalonde$re78 = lalonde$re78 / 1000
# Note: attach(lalonde) is not needed, every model call below passes data = ... explicitly.
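# Quick sanity check (optional, not in the original script): earnings should now be
# measured in thousands of 1978 dollars.
summary(lalonde[, c("re74", "re75", "re78")])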
lalonde.nodegr = subset(lalonde, lalonde$nodegr == 1)
lalonde.degr = subset(lalonde, lalonde$nodegr == 0)
# Question 1
# Find the best model using the 10-fold cross-validation error estimate.
set.seed(23)
cv.errors = rep(0,4)
lm1 = glm(re78 ~ age + black + educ + hisp + u75 + married + treat + re75 + u74 + re74, data = lalonde.nodegr)
summary(lm1)
cv.errors[1] = cv.glm(lalonde.nodegr, lm1, K = 10)$delta[1]
lm2 = glm(re78 ~ age + black + educ + u75 + treat + re75 + u74 + re74, data = lalonde.nodegr)
summary(lm2)
cv.errors[2] = cv.glm(lalonde.nodegr, lm2, K = 10)$delta[1]
lm3 = glm(re78 ~ age + black + educ + u75 + treat + u74, data = lalonde.nodegr)
summary(lm3)
cv.errors[3] = cv.glm(lalonde.nodegr, lm3, K = 10)$delta[1]
lm4 = glm(re78 ~ age + black + u75 + treat, data = lalonde.nodegr)
summary(lm4)
cv.errors[4] = cv.glm(lalonde.nodegr, lm4, K = 10)$delta[1]
names(cv.errors) = c("lm1", "lm2", "lm3", "lm4")
cv.errors # cross-validation error estimates for the four candidate models
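# Optional convenience (not in the original script): pick out the model with the
# lowest estimated test error directly. The name best_model is illustrative.
best_model = names(cv.errors)[which.min(cv.errors)]
cat("Model with the lowest CV error:", best_model, "\n")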
# Final model for the no-degree subgroup
lm_nodegr = glm(re78 ~ age + black + u75 + treat, data = lalonde.nodegr)
summary(lm_nodegr)
set.seed(23)
lm_nodegr.cv.error = cv.glm(lalonde.nodegr, lm_nodegr, K = 10)$delta[1]
cat("Cross Validation Error Estimate::", lm_nodegr.cv.error)
lm_nodegr.sim <- sim(lm_nodegr)
coef(lm_nodegr.sim)
predictors_name = c("age", "black", "u75", "treat")
# Columns of coef(lm_nodegr.sim): intercept, age, black, u75, treat
predictors_ci_lb = c(quantile(coef(lm_nodegr.sim)[,2], c(0.025,0.975))[1],
quantile(coef(lm_nodegr.sim)[,3], c(0.025,0.975))[1],
quantile(coef(lm_nodegr.sim)[,4], c(0.025,0.975))[1],
quantile(coef(lm_nodegr.sim)[,5], c(0.025,0.975))[1])
predictors_ci_ub = c(quantile(coef(lm_nodegr.sim)[,2], c(0.025,0.975))[2],
quantile(coef(lm_nodegr.sim)[,3], c(0.025,0.975))[2],
quantile(coef(lm_nodegr.sim)[,4], c(0.025,0.975))[2],
quantile(coef(lm_nodegr.sim)[,5], c(0.025,0.975))[2])
lm_nodegr.confint = data.frame(predictors_name, predictors_ci_lb, predictors_ci_ub)
names(lm_nodegr.confint) = c("predictors", "lower bound", "upper bound")
lm_nodegr.confint # simulation-based 95% intervals for the predictors
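# As a cross-check (optional, not part of the original analysis), the simulation-based
# intervals above can be compared with the analytic intervals from confint().
confint(lm_nodegr)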
# For degree holders
lm_degr = glm(re78 ~ age + black + u75 + treat, data = lalonde.degr)
summary(lm_degr)
lm_degr.sim <- sim(lm_degr)
coef(lm_degr.sim)
set.seed(25)
lm_degr.cv.error = cv.glm(lalonde.degr, lm_degr, K = 10)$delta[1]
cat("Cross Validation Error Estimate::", lm_degr.cv.error)
predictors_name = c("age", "black", "u75", "treat")
predictors_ci_lb = c(quantile(coef(lm_degr.sim)[,2], c(0.025,0.975))[1],
quantile(coef(lm_degr.sim)[,3], c(0.025,0.975))[1],
quantile(coef(lm_degr.sim)[,4], c(0.025,0.975))[1],
quantile(coef(lm_degr.sim)[,5], c(0.025,0.975))[1])
predictors_ci_ub = c(quantile(coef(lm_degr.sim)[,2], c(0.025,0.975))[2],
quantile(coef(lm_degr.sim)[,3], c(0.025,0.975))[2],
quantile(coef(lm_degr.sim)[,4], c(0.025,0.975))[2],
quantile(coef(lm_degr.sim)[,5], c(0.025,0.975))[2])
lm_degr.confint = data.frame(predictors_name, predictors_ci_lb, predictors_ci_ub)
names(lm_degr.confint) = c("predictors", "lower bound", "upper bound")
lm_degr.confint # simulation-based 95% intervals for the predictors (degree holders)
# Question 2
lm_effect = glm(re78 ~ treat + nodegr + I(treat * nodegr), data = lalonde)
summary(lm_effect)
lm_effect.sim = sim(lm_effect)
coef(lm_effect.sim)
plot (0, 0, ylim = c(-2, 5), xlim = c(0, 1),
xlab="Degree status (nodegree)", ylab="Treatment Effect",
main="Treatment effect with interacting term")
abline (h = 0, lwd=.5, lty=2) # draws a horizontal line
# Treatment effect = coef(treat) + coef(interaction) * nodegr, so draw one line per simulation:
for (i in 1:nrow(coef(lm_effect.sim))) {
  abline(a = coef(lm_effect.sim)[i, 2], b = coef(lm_effect.sim)[i, 4],
         lwd = .5, col = "gray")
}
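# Optional addition (not in the original script): overlay the line implied by the
# point estimates of the fitted model on top of the simulated gray lines.
abline(a = coef(lm_effect)["treat"], b = coef(lm_effect)["I(treat * nodegr)"],
       lwd = 2, col = "red")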
# Estimate and 95% confidence interval of the treatment effect by degree status
coef_interaction = coef(lm_effect.sim)[, 4]
coef_treat = coef(lm_effect.sim)[, 2]
te_degr = coef_interaction * 0 + coef_treat
te_nodegr = coef_interaction * 1 + coef_treat
cat("Average treatment effect for degree holders: ", mean(te_degr))
cat("95% CI of treatment effect for degree holders: ", quantile(te_degr, c(0.025, 0.975)))
cat("Average treatment effect for no degree holders: ", mean(te_nodegr))
cat("95% CI of treatment effect for no degree holders: ", quantile(te_nodegr, c(0.025, 0.975)))
# Random forest to identify the strongest predictors of u78 (no earnings in 1978)
u78 <- ifelse(lalonde$re78 > 0, 0, 1)
lalonde_df_rf = data.frame(lalonde, u78) # new data set with u78 added
names(lalonde_df_rf)
set.seed(1)
train = sample(1:nrow(lalonde_df_rf), nrow(lalonde_df_rf) / 2)
rf.lalonde = randomForest(u78 ~ . - re78 - nodegr, data = lalonde_df_rf,
                          subset = train, mtry = 4, importance = TRUE)
yhat.rf = predict(rf.lalonde, newdata = lalonde_df_rf[-train, ])
cat("Test RMSE:", sqrt(mean((yhat.rf - lalonde_df_rf[-train, ]$u78)^2)), "\n")
importance(rf.lalonde)
varImpPlot(rf.lalonde)
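# Because u78 is binary, a classification forest is a natural alternative to the
# regression forest above (a sketch, not part of the original analysis). The copy
# lalonde_df_class and the object rf.lalonde.class are illustrative names.
lalonde_df_class = lalonde_df_rf
lalonde_df_class$u78 = as.factor(lalonde_df_class$u78)
set.seed(1)
rf.lalonde.class = randomForest(u78 ~ . - re78 - nodegr, data = lalonde_df_class,
                                mtry = 4, importance = TRUE)
rf.lalonde.class$confusion # out-of-bag confusion matrix
varImpPlot(rf.lalonde.class)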
# Question 3
# For Degree holders
u78 <- ifelse(lalonde.degr$re78 > 0, 0, 1)
lalonde.degr <- data.frame(lalonde.degr, u78)
lm_degr.u78 = glm(u78 ~ treat + re75 + u75 + black, data = lalonde.degr, family = "binomial")
summary(lm_degr.u78) # check the p-values for which predictors have a significant effect
cat("Confidence intervals for degree holders:\n")
confint(lm_degr.u78)
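# The logistic coefficients are on the log-odds scale; exponentiating the estimates
# and their confidence limits gives odds ratios (an optional interpretation aid,
# not part of the original script).
exp(cbind(OR = coef(lm_degr.u78), confint(lm_degr.u78)))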
# Bootstrap to find confidence intervals for the logistic regression coefficients
logit.bootstrap <- function(data, indices) {
  d <- data[indices, ]
  fit <- glm(u78 ~ treat + re75 + u75 + black, data = d, family = "binomial")
  return(coef(fit))
}
set.seed(12345)
lm_degr.u78.boot <- boot(data = lalonde.degr, statistic = logit.bootstrap, R = 10000) # 10,000 bootstrap resamples
lm_degr.u78.boot
# 95% BCa confidence intervals for each coefficient
boot.ci(lm_degr.u78.boot, type="bca", index=1) # intercept
boot.ci(lm_degr.u78.boot, type="bca", index=2) # treat
boot.ci(lm_degr.u78.boot, type="bca", index=3) # re75
boot.ci(lm_degr.u78.boot, type="bca", index=4) # u75
boot.ci(lm_degr.u78.boot, type="bca", index=5) # black
# For No Degree Holders
u78 <- ifelse(lalonde.nodegr$re78 > 0, 0, 1)
lalonde.nodegr <- data.frame(lalonde.nodegr, u78)
lm_nodegr.u78 = glm(u78 ~ treat + re75 + u75 + black, data = lalonde.nodegr, family = "binomial")
summary(lm_nodegr.u78) # check the p-values for which predictors have a significant effect
# Same bootstrap statistic as above: logistic fit on the resampled data
logit.bootstrap <- function(data, indices) {
  d <- data[indices, ]
  fit <- glm(u78 ~ treat + re75 + u75 + black, data = d, family = "binomial")
  return(coef(fit))
}
lm_nodegr.u78.boot <- boot(data = lalonde.nodegr, statistic = logit.bootstrap, R = 10000) # 10,000 bootstrap resamples
lm_nodegr.u78.boot
boot.ci(lm_nodegr.u78.boot, type="bca", index=1) #intercept
boot.ci(lm_nodegr.u78.boot, type="bca", index=2) # treat
boot.ci(lm_nodegr.u78.boot, type="bca", index=3) # re75
boot.ci(lm_nodegr.u78.boot, type="bca", index=4) # u75
boot.ci(lm_nodegr.u78.boot, type="bca", index=5) # black