Last active
March 8, 2018 18:11
-
-
Save owstron/f4bc7b777338fc2947437d154a8e2781 to your computer and use it in GitHub Desktop.
CS112 8.2 Statistical Matching Pre-Classwork
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # know your working directory: | |
| getwd() | |
| ####### PART 1: Get the 2 RCT data sets... | |
| # website with data is here: http://users.nber.org/~rdehejia/data/nswdata2.html | |
| ### For the original Lalonde data | |
| # STEP 1: Download the two text files, nsw_control.txt and nsw_treated.txt | |
| # STEP 2: Ensure that they are in R's working directory | |
| # STEP 3: Read them into your R workspace | |
| nsw_controls <- read.table("nsw_control.txt") | |
| nsw_treated <- read.table("nsw_treated.txt") | |
| # STEP 4: Bind the treated and controls together, into one data frame | |
| nsw_data <- rbind(nsw_treated, nsw_controls) | |
| head(nsw_data) | |
| additional_column_to_label_data_set <- rep(c("Original Lalonde Sample"), length(nsw_data[,1])) | |
| nsw_data <- cbind(additional_column_to_label_data_set, nsw_data) | |
| names(nsw_data) <- c("data_id", "treat", "age", "educ", "black", "hisp", | |
| "married", "nodegr", "re75", "re78") | |
| head(nsw_data) | |
| ### For Dehejia's version of the Lalonde Data | |
| ### I show you how to deal with files in STATA format below (very easy) | |
| ### Download the file: "nsw_dw.dta" and confirm it's in R's working directory | |
| library(foreign) | |
| DW_data <- read.dta("nsw_dw.dta") | |
| head(DW_data) | |
| ###### PART II: CREATE THE FAKE OBSERVATIONAL DATA SETS | |
| ### Now to create the 2 simulated observational data sets that each combine the | |
| ### treatment group from the data sets above, with CPS-1 survey data | |
| ### A. ### First with the original Lalonde RCT sample | |
| # Step 1: download "cps_controls.dta" and make sure it's in R's working directory | |
| # Step 2: read in cps_controls.dta and confirm it has the same structure | |
| cps_controls <- read.dta("cps_controls.dta") | |
| head(cps_controls) | |
| # notice that the columns of cps_controls and nsw_data are different: | |
| # nsw_data lacks the re74 column... we have to make the columns consistent b4 rbinding them | |
| names(nsw_data) | |
| names(cps_controls) | |
| cps_controls_without_re74 <- cps_controls[,-9] | |
| names(cps_controls_without_re74) <- names(nsw_data) | |
| # Step 3: erase the RCT experiment's control group data from the Lalonde ("nsw_data") data set | |
| nsw_data_nocontrols <- nsw_data[-which(nsw_data$treat == 0),] | |
| # Step 4: rbind the nsw_data_nocontrols and the cps_controls together | |
| nsw_treated_data_with_CPS <- rbind(nsw_data_nocontrols, cps_controls_without_re74) | |
| ### B. Second with Dehejia's experimental sample, which includes re74... | |
| ### in other words, 2 years of pre-treatment earnings -- Dehejia thought it was necessary | |
| ### to control for more than 1 year of pre-treatment earnings... | |
| # Make sure you have downloaded "cps_controls.dta" and make sure it's in R's working directory | |
| # Read it in and check it out... (you probably did this already, above) | |
| cps_controls <- read.dta("cps_controls.dta") | |
| head(cps_controls) | |
| # NEXT, make sure cps_controls has the same column names as Dehejia's experiment's data set | |
| cps_controls_new_names <- cps_controls | |
| names(cps_controls_new_names) <- names(DW_data) | |
| # Step 3: erase the RCT experiment's control group data from Dehejia's ("nsw_data") data set | |
| DW_data_nocontrols <- DW_data[-which(DW_data$treat == 0),] | |
| # Step 4: rbind the nsw_data_nocontrols and the cps_controls together | |
| DW_treated_data_with_CPS <- rbind(DW_data_nocontrols, cps_controls_new_names) | |
| ########## CONCLUSION | |
| # you now have 4 data sets-- | |
| # 2 derived from RCTs, | |
| # 2 'hybrids' with treated units from each RCT and control units from the CPS survey | |
| #Calculate the difference in means of the outcome (_re78_) in both samples and | |
| #run balance tests (t-tests) for key covariates. | |
| #Are the estimated causal effects the same or similar? | |
| diff_mean_dw = mean(DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 1),]$re78) - mean(DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 0),]$re78) | |
| cat("Difference of Means for DW dataset", diff_mean_dw) | |
| diff_mean_nsw = mean(nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 1),]$re78) - mean(nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 0),]$re78) | |
| cat("Difference of Means for NSW dataset", diff_mean_nsw) | |
| dw_treat <- DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 1),] | |
| dw_control <- DW_treated_data_with_CPS[which(DW_treated_data_with_CPS$treat == 0),] | |
| nsw_treat <- nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 1),] | |
| nsw_control <- nsw_treated_data_with_CPS[which(nsw_treated_data_with_CPS$treat == 0),] | |
| #age | |
| t.test(dw_treat$age, dw_control$age) | |
| t.test(nsw_treat$age, nsw_control$age) | |
| #re75 | |
| t.test(dw_treat$re75, dw_control$re75) | |
| t.test(nsw_treat$re75, nsw_control$re75) | |
| #educ | |
| t.test(dw_treat$educ, dw_control$educ) | |
| t.test(nsw_treat$educ, nsw_control$educ) | |
| #hisp | |
| t.test(dw_treat$hispanic, dw_control$hispanic) #p-value 0.4746 | |
| t.test(nsw_treat$hisp, nsw_control$hisp) #p-value = 0.1946 | |
| #black | |
| t.test(dw_treat$black, dw_control$black) | |
| t.test(nsw_treat$black, nsw_control$black) | |
| #married | |
| t.test(dw_treat$married, dw_control$married) | |
| t.test(nsw_treat$married, nsw_control$married) | |
| #nodegr | |
| t.test(dw_treat$nodegr, dw_control$nodegr) | |
| t.test(nsw_treat$nodegr, nsw_control$nodegr) | |
| #re75 | |
| t.test(dw_treat$re75, dw_control$re75) | |
| t.test(nsw_treat$re75, nsw_control$re75) | |
| # Only hsipanic was the key covariate with a high p-value. For DSW data set it was 0.4746 | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library('Matching') | |
| nswre74_control <- read.table("nswre74_control.txt") | |
| nswre74_treated <- read.table("nswre74_treated.txt") | |
| cps_control <- read.table("cps_controls.txt") | |
| nswre74_data <- rbind(nswre74_treated, nswre74_controls) | |
| names(nswre74_treated) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78") | |
| names(cps_control) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78") | |
| names(nswre74_control) <- c("treat", "age", "educ", "black", "hisp", "married", "nodegr", "re74", "re75", "re78") | |
| nswre74_cps_control <- rbind(nswre74_treated, cps_control) | |
| pscore.glm<-glm(treat ~ age + I(age^2) + educ + I(educ^2) + black + | |
| hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2), | |
| family=binomial(logit), data=nswre74_cps_control) | |
| X <- pscore.glm$fitted | |
| Y <- nswre74_cps_control$re78 | |
| Tr <- nswre74_cps_control$treat | |
| #Propensity Score Matching using propensity score | |
| psm <- Match(Y=Y, Tr=Tr, X=X, M=1) | |
| summary(psm) | |
| #Answer p-value is 0.079366 | |
| #MatchBalance to find matches | |
| psm.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black + | |
| hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2), | |
| data=nswre74_cps_control, match.out=psm, nboots=10) | |
| # Very bad matching using PSM Minimim p-value before and after is 2.22e-16 | |
| #Mahalanobis Matching | |
| maha_match <- Match(Y=Y, Tr=Tr, X=X, M=1, Weight=2) | |
| summary(maha_match) | |
| #p-value is 0.079366 | |
| maha_match.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black + | |
| hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2), | |
| data=nswre74_cps_control, match.out=psm, nboots=10) | |
| # Very bad matching using Mahalanobis Matching Minimum p-value before and after is 2.22e-16 | |
| #Genetic Matching | |
| D <- nswre74_cps_control$treat | |
| Y <- nswre74_cps_control$re78 | |
| X <- cbind(nswre74_cps_control$age, I(nswre74_cps_control$age^2), nswre74_cps_control$educ, | |
| I(nswre74_cps_control$educ^2), | |
| nswre74_cps_control$black == 1, | |
| nswre74_cps_control$black == 1, | |
| nswre74_cps_control$hisp == 1, | |
| nswre74_cps_control$hisp == 0, | |
| nswre74_cps_control$married == 1, | |
| nswre74_cps_control$married == 0, | |
| nswre74_cps_control$nodegr == 1, | |
| nswre74_cps_control$nodegr == 0, | |
| nswre74_cps_control$re74, | |
| I(nswre74_cps_control$re74^2), nswre74_cps_control$re75, I(nswre74_cps_control$re75^2)) | |
| genMatch_weights <- GenMatch(Tr=D, X=X, M=1, pop.size = 10) | |
| mout <- Match(Y = Y, Tr = D, X = X, M = 1, Weight.matrix = genMatch_weights) | |
| summary(mout) | |
| # p.val...... 0.014411 | |
| maha_match.mb <- MatchBalance(treat ~ age + I(age^2) + educ + I(educ^2) + black + | |
| hisp + married + nodegr + re74 + I(re74^2) + re75 + I(re75^2), | |
| data=nswre74_cps_control, match.out=mout) | |
| # Minimum P-value after matching is 0.018, very impressive :) | |
| # Properties of matched data set | |
| # Extremely good match, with minimum of 0.018 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment