Skip to content

Instantly share code, notes, and snippets.

@owstron
Last active March 8, 2018 18:11
Show Gist options
  • Select an option

  • Save owstron/f4bc7b777338fc2947437d154a8e2781 to your computer and use it in GitHub Desktop.

Select an option

Save owstron/f4bc7b777338fc2947437d154a8e2781 to your computer and use it in GitHub Desktop.
CS112 8.2 Statistical Matching Pre-Classwork
# know your working directory:
getwd()
####### PART 1: Get the 2 RCT data sets...
# website with data is here: http://users.nber.org/~rdehejia/data/nswdata2.html
### For the original Lalonde data
# STEP 1: Download the two text files, nsw_control.txt and nsw_treated.txt
# STEP 2: Ensure that they are in R's working directory
# STEP 3: Read them into your R workspace
nsw_controls <- read.table("nsw_control.txt")
nsw_treated <- read.table("nsw_treated.txt")
# STEP 4: Bind the treated and controls together, into one data frame
nsw_data <- rbind(nsw_treated, nsw_controls)
head(nsw_data)
additional_column_to_label_data_set <- rep(c("Original Lalonde Sample"), length(nsw_data[,1]))
nsw_data <- cbind(additional_column_to_label_data_set, nsw_data)
names(nsw_data) <- c("data_id", "treat", "age", "educ", "black", "hisp",
"married", "nodegr", "re75", "re78")
head(nsw_data)
### For Dehejia's version of the Lalonde Data
### I show you how to deal with files in STATA format below (very easy)
### Download the file: "nsw_dw.dta" and confirm it's in R's working directory
library(foreign)
DW_data <- read.dta("nsw_dw.dta")
head(DW_data)
###### PART II: CREATE THE FAKE OBSERVATIONAL DATA SETS
### Now to create the 2 simulated observational data sets that each combine the
### treatment group from the data sets above, with CPS-1 survey data
### A. ### First with the original Lalonde RCT sample
# Step 1: download "cps_controls.dta" and make sure it's in R's working directory
# Step 2: read in cps_controls.dta and confirm it has the same structure
cps_controls <- read.dta("cps_controls.dta")
head(cps_controls)
# notice that the columns of cps_controls and nsw_data are different:
# nsw_data lacks the re74 column... we have to make the columns consistent b4 rbinding them
names(nsw_data)
names(cps_controls)
cps_controls_without_re74 <- cps_controls[,-9]
names(cps_controls_without_re74) <- names(nsw_data)
# Step 3: erase the RCT experiment's control group data from the Lalonde ("nsw_data") data set
nsw_data_nocontrols <- nsw_data[-which(nsw_data$treat == 0),]
# Step 4: rbind the nsw_data_nocontrols and the cps_controls together
nsw_treated_data_with_CPS <- rbind(nsw_data_nocontrols, cps_controls_without_re74)
### B. Second with Dehejia's experimental sample, which includes re74...
### in other words, 2 years of pre-treatment earnings -- Dehejia thought it was necessary
### to control for more than 1 year of pre-treatment earnings...
# Make sure you have downloaded "cps_controls.dta" and make sure it's in R's working directory
# Read it in and check it out... (you probably did this already, above)
cps_controls <- read.dta("cps_controls.dta")
head(cps_controls)
# NEXT, make sure cps_controls has the same column names as Dehejia's experiment's data set
cps_controls_new_names <- cps_controls
names(cps_controls_new_names) <- names(DW_data)
# Step 3: erase the RCT experiment's control group data from Dehejia's ("nsw_data") data set
DW_data_nocontrols <- DW_data[-which(DW_data$treat == 0),]
# Step 4: rbind the nsw_data_nocontrols and the cps_controls together
DW_treated_data_with_CPS <- rbind(DW_data_nocontrols, cps_controls_new_names)
########## CONCLUSION
# you now have 4 data sets--
# 2 derived from RCTs,
# 2 'hybrids' with treated units from each RCT and control units from the CPS survey
#Calculate the difference in means of the outcome (_re78_) in both samples and
#run balance tests (t-tests) for key covariates.
#Are the estimated causal effects the same or similar?
diff_mean_dw = mean(DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 1),]$re78) - mean(DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 0),]$re78)
cat("Difference of Means for DW dataset", diff_mean_dw)
diff_mean_nsw = mean(nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 1),]$re78) - mean(nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 0),]$re78)
cat("Difference of Means for NSW dataset", diff_mean_nsw)
dw_treat <- DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 1),]
dw_control <- DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 0),]
nsw_treat <- nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 1),]
nsw_control <- nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 0),]
#age
t.test(dw_treat$age, dw_control$age)
t.test(nsw_treat$age, nsw_control$age)
#re75
t.test(dw_treat$re75, dw_control$re75)
t.test(nsw_treat$re75, nsw_control$re75)
#educ
t.test(dw_treat$educ, dw_control$educ)
t.test(nsw_treat$educ, nsw_control$educ)
#hisp
t.test(dw_treat$hispanic, dw_control$hispanic) #p-value 0.4746
t.test(nsw_treat$hisp, nsw_control$hisp) #p-value = 0.1946
#black
t.test(dw_treat$black, dw_control$black)
t.test(nsw_treat$black, nsw_control$black)
#married
t.test(dw_treat$married, dw_control$married)
t.test(nsw_treat$married, nsw_control$married)
#nodegr
t.test(dw_treat$nodegr, dw_control$nodegr)
t.test(nsw_treat$nodegr, nsw_control$nodegr)
#re75
t.test(dw_treat$re75, dw_control$re75)
t.test(nsw_treat$re75, nsw_control$re75)
# Only hsipanic was the key covariate with a high p-value. For DSW data set it was 0.4746
library('Matching')
nswre74_control <- read.table("nswre74_control.txt")
nswre74_treated <- read.table("nswre74_treated.txt")
cps_control <- read.table("cps_controls.txt")
nswre74_data <- rbind(nswre74_treated, nswre74_controls)
names(nswre74_treated) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78")
names(cps_control) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78")
names(nswre74_control) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78")
nswre74_cps_control <- rbind(nswre74_treated, cps_control)
pscore.glm<-glm(treat ~ age + I(age^2) + educ + I(educ^2) + black +
hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2),
family=binomial(logit), data=nswre74_cps_control)
X <- pscore.glm$fitted
Y <- nswre74_cps_control$re78
Tr <- nswre74_cps_control$treat
#Propensity Score Matching using propensity score
psm <- Match(Y=Y, Tr=Tr, X=X, M=1)
summary(psm)
#Answer p-value is 0.079366
#MatchBalance to find matches
psm.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black +
hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2),
data=nswre74_cps_control, match.out=psm, nboots=10)
# Very bad matching using PSM Minimim p-value before and after is 2.22e-16
#Mahalanobis Matching
maha_match <- Match(Y=Y, Tr=Tr, X=X, M=1, Weight=2)
summary(maha_match)
#p-value is 0.079366
maha_match.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black +
hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2),
data=nswre74_cps_control, match.out=psm, nboots=10)
# Very bad matching using Mahalanobis Matching Minimum p-value before and after is 2.22e-16
#Genetic Matching
D <- nswre74_cps_control$treat
Y <- nswre74_cps_control$re78
X <- cbind(nswre74_cps_control$age, I(nswre74_cps_control$age^2), nswre74_cps_control$educ,
I(nswre74_cps_control$educ^2),
nswre74_cps_control$black == 1,
nswre74_cps_control$black == 1,
nswre74_cps_control$hisp == 1,
nswre74_cps_control$hisp == 0,
nswre74_cps_control$married == 1,
nswre74_cps_control$married == 0,
nswre74_cps_control$nodegr == 1,
nswre74_cps_control$nodegr == 0,
nswre74_cps_control$re74,
I(nswre74_cps_control$re74^2), nswre74_cps_control$re75, I(nswre74_cps_control$re75^2))
genMatch_weights <- GenMatch(Tr=D, X=X, M=1, pop.size = 10)
mout <- Match(Y = Y, Tr = D, X = X, M = 1, Weight.matrix = genMatch_weights)
summary(mout)
# p.val...... 0.014411
maha_match.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black +
hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2),
data=nswre74_cps_control, match.out=mout)
# Minimum P-value after matching is 0.018, very impressive :)
# Properties of matched data set
# Extremely good match, with minimum of 0.018
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment