owstron · March 8, 2018 18:11
diff --git a/CS112-8-2-q1-t-test.R b/CS112-8-2-q1-t-test.R
 # know your working directory:
 getwd()

 ####### PART 1: Get the 2 RCT data sets...
 # website with data is here: http://users.nber.org/~rdehejia/data/nswdata2.html

 ### For the original Lalonde data
 # STEP 1: Download the two text files, nsw_control.txt and nsw_treated.txt

 # STEP 2: Ensure that they are in R's working directory

 # STEP 3: Read them into your R workspace
 nsw_controls <- read.table("nsw_control.txt")
 nsw_treated <- read.table("nsw_treated.txt")

 # STEP 4: Bind the treated and controls together, into one data frame
 nsw_data <- rbind(nsw_treated, nsw_controls)
 head(nsw_data)
 additional_column_to_label_data_set <- rep(c("Original Lalonde Sample"), length(nsw_data[,1]))
 nsw_data <- cbind(additional_column_to_label_data_set, nsw_data)
 names(nsw_data) <-  c("data_id", "treat", "age", "educ", "black", "hisp",
                      "married", "nodegr", "re75", "re78")

 head(nsw_data)

 ### For Dehejia's version of the Lalonde Data 
 ### I show you how to deal with files in STATA format below (very easy)
 ### Download the file: "nsw_dw.dta" and confirm it's in R's working directory
 library(foreign)
 DW_data <- read.dta("nsw_dw.dta")

 head(DW_data)

 ###### PART II: CREATE THE FAKE OBSERVATIONAL DATA SETS

 ### Now to create the 2 simulated observational data sets that each combine the
 ### treatment group from the data sets above, with CPS-1 survey data

 ### A. ### First with the original Lalonde RCT sample

 # Step 1: download "cps_controls.dta" and make sure it's in R's working directory

 # Step 2: read in cps_controls.dta and confirm it has the same structure
 cps_controls <- read.dta("cps_controls.dta")
 head(cps_controls)

 # notice that the columns of cps_controls and nsw_data are different:  
 # nsw_data lacks the re74 column... we have to make the columns consistent b4 rbinding them
 names(nsw_data)
 names(cps_controls)

 cps_controls_without_re74 <- cps_controls[,-9]
 names(cps_controls_without_re74) <- names(nsw_data)

 # Step 3: erase the RCT experiment's control group data from the Lalonde ("nsw_data") data set
 nsw_data_nocontrols <- nsw_data[-which(nsw_data$treat == 0),]

 # Step 4: rbind the nsw_data_nocontrols and the cps_controls together
 nsw_treated_data_with_CPS <- rbind(nsw_data_nocontrols, cps_controls_without_re74)

 ### B. Second with Dehejia's experimental sample, which includes re74... 
 ###    in other words, 2 years of pre-treatment earnings -- Dehejia thought it was necessary
 ###    to control for more than 1 year of pre-treatment earnings...

 # Make sure you have downloaded "cps_controls.dta" and make sure it's in R's working directory
 # Read it in and check it out... (you probably did this already, above)
 cps_controls <- read.dta("cps_controls.dta")
 head(cps_controls)

 # NEXT, make sure cps_controls has the same column names as Dehejia's experiment's data set
 cps_controls_new_names <- cps_controls
 names(cps_controls_new_names) <- names(DW_data)

 # Step 3: erase the RCT experiment's control group data from Dehejia's ("nsw_data") data set
 DW_data_nocontrols <- DW_data[-which(DW_data$treat == 0),]

 # Step 4: rbind the nsw_data_nocontrols and the cps_controls together
 DW_treated_data_with_CPS <- rbind(DW_data_nocontrols, cps_controls_new_names)

 ########## CONCLUSION
 # you now have 4 data sets--
 # 2 derived from RCTs, 
 # 2 'hybrids' with treated units from each RCT and control units from the CPS survey

 #Calculate the difference in means of the outcome (_re78_) in both samples and 
 #run balance tests (t-tests) for key covariates. 
 #Are the estimated causal effects the same or similar?



 diff_mean_dw = mean(DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 1),]$re78) - mean(DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 0),]$re78)
 cat("Difference of Means for DW dataset", diff_mean_dw)
 diff_mean_nsw = mean(nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 1),]$re78) - mean(nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 0),]$re78)
 cat("Difference of Means for NSW dataset", diff_mean_nsw)

 dw_treat <- DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 1),]
 dw_control <- DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 0),]
 nsw_treat <- nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 1),]
 nsw_control <- nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 0),]

 #age
 t.test(dw_treat$age, dw_control$age)
 t.test(nsw_treat$age, nsw_control$age)
 #re75
 t.test(dw_treat$re75, dw_control$re75)
 t.test(nsw_treat$re75, nsw_control$re75)
 #educ
 t.test(dw_treat$educ, dw_control$educ)
 t.test(nsw_treat$educ, nsw_control$educ)
 #hisp
 t.test(dw_treat$hispanic, dw_control$hispanic) #p-value 0.4746
 t.test(nsw_treat$hisp, nsw_control$hisp) #p-value = 0.1946
 #black
 t.test(dw_treat$black, dw_control$black)
 t.test(nsw_treat$black, nsw_control$black)
 #married
 t.test(dw_treat$married, dw_control$married)
 t.test(nsw_treat$married, nsw_control$married)
 #nodegr
 t.test(dw_treat$nodegr, dw_control$nodegr)
 t.test(nsw_treat$nodegr, nsw_control$nodegr)
 #re75
 t.test(dw_treat$re75, dw_control$re75)
 t.test(nsw_treat$re75, nsw_control$re75)

 # Only hsipanic was the key covariate with a high p-value. For DSW data set it was 0.4746


diff --git a/CS112-8-2-q2-matching.R b/CS112-8-2-q2-matching.R
 library('Matching')
 nswre74_control <- read.table("nswre74_control.txt")
 nswre74_treated <- read.table("nswre74_treated.txt")
 cps_control <- read.table("cps_controls.txt")

 nswre74_data <- rbind(nswre74_treated, nswre74_controls)

 names(nswre74_treated) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78")
 names(cps_control) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78")
 names(nswre74_control) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78")

 nswre74_cps_control <- rbind(nswre74_treated, cps_control)

 pscore.glm<-glm(treat ~ age + I(age^2) + educ + I(educ^2) + black +
                  hisp + married + nodegr + re74  + I(re74^2) + re75 + I(re75^2), 
                family=binomial(logit), data=nswre74_cps_control)

 X <- pscore.glm$fitted
 Y <- nswre74_cps_control$re78
 Tr <- nswre74_cps_control$treat

 #Propensity Score Matching using propensity score
 psm <- Match(Y=Y, Tr=Tr, X=X, M=1)
 summary(psm)
 #Answer p-value is 0.079366

 #MatchBalance to find matches
 psm.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black +
                         hisp + married + nodegr + re74  + I(re74^2) + re75 + I(re75^2),
                        data=nswre74_cps_control, match.out=psm, nboots=10)
 # Very bad matching using PSM Minimim p-value before and after is 2.22e-16


 #Mahalanobis Matching
 maha_match <- Match(Y=Y, Tr=Tr, X=X, M=1, Weight=2)
 summary(maha_match)
 #p-value is 0.079366
 maha_match.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black +
                         hisp + married + nodegr + re74  + I(re74^2) + re75 + I(re75^2),
                       data=nswre74_cps_control, match.out=psm, nboots=10)
 # Very bad matching using Mahalanobis Matching Minimum p-value before and after is 2.22e-16


 #Genetic Matching

 D <- nswre74_cps_control$treat
 Y <- nswre74_cps_control$re78

 X <- cbind(nswre74_cps_control$age, I(nswre74_cps_control$age^2), nswre74_cps_control$educ,
           I(nswre74_cps_control$educ^2), 
           nswre74_cps_control$black == 1,
           nswre74_cps_control$black == 1,
           nswre74_cps_control$hisp == 1,
           nswre74_cps_control$hisp == 0,
           nswre74_cps_control$married == 1, 
           nswre74_cps_control$married == 0, 
           nswre74_cps_control$nodegr == 1,
           nswre74_cps_control$nodegr == 0,
           nswre74_cps_control$re74,
           I(nswre74_cps_control$re74^2), nswre74_cps_control$re75, I(nswre74_cps_control$re75^2))


 genMatch_weights <- GenMatch(Tr=D, X=X, M=1, pop.size = 10)
 mout <- Match(Y = Y, Tr = D, X = X, M = 1, Weight.matrix = genMatch_weights)

 summary(mout)
 # p.val......  0.014411 


 maha_match.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black +
                                hisp + married + nodegr + re74  + I(re74^2) + re75 + I(re75^2),
                              data=nswre74_cps_control, match.out=mout)
 # Minimum P-value after matching is 0.018, very impressive :)

 # Properties of matched data set
 # Extremely good match, with minimum of 0.018
	# know your working directory:
	getwd()

	####### PART 1: Get the 2 RCT data sets...
	# website with data is here: http://users.nber.org/~rdehejia/data/nswdata2.html

	### For the original Lalonde data
	# STEP 1: Download the two text files, nsw_control.txt and nsw_treated.txt

	# STEP 2: Ensure that they are in R's working directory

	# STEP 3: Read them into your R workspace
	nsw_controls <- read.table("nsw_control.txt")
	nsw_treated <- read.table("nsw_treated.txt")

	# STEP 4: Bind the treated and controls together, into one data frame
	nsw_data <- rbind(nsw_treated, nsw_controls)
	head(nsw_data)
	additional_column_to_label_data_set <- rep(c("Original Lalonde Sample"), length(nsw_data[,1]))
	nsw_data <- cbind(additional_column_to_label_data_set, nsw_data)
	names(nsw_data) <- c("data_id", "treat", "age", "educ", "black", "hisp",
	"married", "nodegr", "re75", "re78")

	head(nsw_data)

	### For Dehejia's version of the Lalonde Data
	### I show you how to deal with files in STATA format below (very easy)
	### Download the file: "nsw_dw.dta" and confirm it's in R's working directory
	library(foreign)
	DW_data <- read.dta("nsw_dw.dta")

	head(DW_data)

	###### PART II: CREATE THE FAKE OBSERVATIONAL DATA SETS

	### Now to create the 2 simulated observational data sets that each combine the
	### treatment group from the data sets above, with CPS-1 survey data

	### A. ### First with the original Lalonde RCT sample

	# Step 1: download "cps_controls.dta" and make sure it's in R's working directory

	# Step 2: read in cps_controls.dta and confirm it has the same structure
	cps_controls <- read.dta("cps_controls.dta")
	head(cps_controls)

	# notice that the columns of cps_controls and nsw_data are different:
	# nsw_data lacks the re74 column... we have to make the columns consistent b4 rbinding them
	names(nsw_data)
	names(cps_controls)

	cps_controls_without_re74 <- cps_controls[,-9]
	names(cps_controls_without_re74) <- names(nsw_data)

	# Step 3: erase the RCT experiment's control group data from the Lalonde ("nsw_data") data set
	nsw_data_nocontrols <- nsw_data[-which(nsw_data$treat == 0),]

	# Step 4: rbind the nsw_data_nocontrols and the cps_controls together
	nsw_treated_data_with_CPS <- rbind(nsw_data_nocontrols, cps_controls_without_re74)

	### B. Second with Dehejia's experimental sample, which includes re74...
	### in other words, 2 years of pre-treatment earnings -- Dehejia thought it was necessary
	### to control for more than 1 year of pre-treatment earnings...

	# Make sure you have downloaded "cps_controls.dta" and make sure it's in R's working directory
	# Read it in and check it out... (you probably did this already, above)
	cps_controls <- read.dta("cps_controls.dta")
	head(cps_controls)

	# NEXT, make sure cps_controls has the same column names as Dehejia's experiment's data set
	cps_controls_new_names <- cps_controls
	names(cps_controls_new_names) <- names(DW_data)

	# Step 3: erase the RCT experiment's control group data from Dehejia's ("nsw_data") data set
	DW_data_nocontrols <- DW_data[-which(DW_data$treat == 0),]

	# Step 4: rbind the nsw_data_nocontrols and the cps_controls together
	DW_treated_data_with_CPS <- rbind(DW_data_nocontrols, cps_controls_new_names)

	########## CONCLUSION
	# you now have 4 data sets--
	# 2 derived from RCTs,
	# 2 'hybrids' with treated units from each RCT and control units from the CPS survey

	#Calculate the difference in means of the outcome (_re78_) in both samples and
	#run balance tests (t-tests) for key covariates.
	#Are the estimated causal effects the same or similar?



	diff_mean_dw = mean(DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 1),]$re78) - mean(DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 0),]$re78)
	cat("Difference of Means for DW dataset", diff_mean_dw)
	diff_mean_nsw = mean(nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 1),]$re78) - mean(nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 0),]$re78)
	cat("Difference of Means for NSW dataset", diff_mean_nsw)

	dw_treat <- DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 1),]
	dw_control <- DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 0),]
	nsw_treat <- nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 1),]
	nsw_control <- nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 0),]

	#age
	t.test(dw_treat$age, dw_control$age)
	t.test(nsw_treat$age, nsw_control$age)
	#re75
	t.test(dw_treat$re75, dw_control$re75)
	t.test(nsw_treat$re75, nsw_control$re75)
	#educ
	t.test(dw_treat$educ, dw_control$educ)
	t.test(nsw_treat$educ, nsw_control$educ)
	#hisp
	t.test(dw_treat$hispanic, dw_control$hispanic) #p-value 0.4746
	t.test(nsw_treat$hisp, nsw_control$hisp) #p-value = 0.1946
	#black
	t.test(dw_treat$black, dw_control$black)
	t.test(nsw_treat$black, nsw_control$black)
	#married
	t.test(dw_treat$married, dw_control$married)
	t.test(nsw_treat$married, nsw_control$married)
	#nodegr
	t.test(dw_treat$nodegr, dw_control$nodegr)
	t.test(nsw_treat$nodegr, nsw_control$nodegr)
	#re75
	t.test(dw_treat$re75, dw_control$re75)
	t.test(nsw_treat$re75, nsw_control$re75)

	# Only hsipanic was the key covariate with a high p-value. For DSW data set it was 0.4746
	library('Matching')
	nswre74_control <- read.table("nswre74_control.txt")
	nswre74_treated <- read.table("nswre74_treated.txt")
	cps_control <- read.table("cps_controls.txt")

	nswre74_data <- rbind(nswre74_treated, nswre74_controls)

	names(nswre74_treated) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78")
	names(cps_control) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78")
	names(nswre74_control) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78")

	nswre74_cps_control <- rbind(nswre74_treated, cps_control)

	pscore.glm<-glm(treat ~ age + I(age^2) + educ + I(educ^2) + black +
	hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2),
	family=binomial(logit), data=nswre74_cps_control)

	X <- pscore.glm$fitted
	Y <- nswre74_cps_control$re78
	Tr <- nswre74_cps_control$treat

	#Propensity Score Matching using propensity score
	psm <- Match(Y=Y, Tr=Tr, X=X, M=1)
	summary(psm)
	#Answer p-value is 0.079366

	#MatchBalance to find matches
	psm.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black +
	hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2),
	data=nswre74_cps_control, match.out=psm, nboots=10)
	# Very bad matching using PSM Minimim p-value before and after is 2.22e-16


	#Mahalanobis Matching
	maha_match <- Match(Y=Y, Tr=Tr, X=X, M=1, Weight=2)
	summary(maha_match)
	#p-value is 0.079366
	maha_match.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black +
	hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2),
	data=nswre74_cps_control, match.out=psm, nboots=10)
	# Very bad matching using Mahalanobis Matching Minimum p-value before and after is 2.22e-16


	#Genetic Matching

	D <- nswre74_cps_control$treat
	Y <- nswre74_cps_control$re78

	X <- cbind(nswre74_cps_control$age, I(nswre74_cps_control$age^2), nswre74_cps_control$educ,
	I(nswre74_cps_control$educ^2),
	nswre74_cps_control$black == 1,
	nswre74_cps_control$black == 1,
	nswre74_cps_control$hisp == 1,
	nswre74_cps_control$hisp == 0,
	nswre74_cps_control$married == 1,
	nswre74_cps_control$married == 0,
	nswre74_cps_control$nodegr == 1,
	nswre74_cps_control$nodegr == 0,
	nswre74_cps_control$re74,
	I(nswre74_cps_control$re74^2), nswre74_cps_control$re75, I(nswre74_cps_control$re75^2))


	genMatch_weights <- GenMatch(Tr=D, X=X, M=1, pop.size = 10)
	mout <- Match(Y = Y, Tr = D, X = X, M = 1, Weight.matrix = genMatch_weights)

	summary(mout)
	# p.val...... 0.014411


	maha_match.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black +
	hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2),
	data=nswre74_cps_control, match.out=mout)
	# Minimum P-value after matching is 0.018, very impressive :)

	# Properties of matched data set
	# Extremely good match, with minimum of 0.018