This Notebook replicates the results of Section 8.2 using the publicly available data that can be downloaded here. The notebook should help to …
show the underlying code for researchers interested in replicating the analysis or in adapting it to new datasets
show that we get similar results if we only use random forests instead of the ensemble method
The explanations throughout the notebook are kept short. Please see the paper for more details regarding the decomposition parameters and their estimation.
Click on the “Code” button on the top right of this notebook to “Hide All Code” and see only the results or “Download Rmd” to extract the underlying code.
Running the analyses in this notebook took roughly two hours on a SWITCHengine with eight cores and 32 GB RAM. Running it with the full emsemble took three days. To replicate the ensemble uncomment the lines 172 to 175 in the .RmD file.
Download the impact.sas7bdat, baseline.sas7bdat, key_vars.sas7bdat, and mileston.sas7bdat files from OPENICPSR and save them in the same folder as the notebook.
# Load required libraries
library(causalDML)
library(sas7bdat)
library(tidyverse)
# Set seed
seed = 1234
set.seed(seed)
Now prepare outcome, effective treatment, and confounders as well as the heterogeneity variable gender:
# Download dataset from https://www.openicpsr.org/openicpsr/project/113269/version/V1/view
# into working directory and load the raw data (takes some time)
impact_raw = read.sas7bdat("impact.sas7bdat")
baseline_raw = read.sas7bdat("baseline.sas7bdat")
key_vars_raw = read.sas7bdat("key_vars.sas7bdat")
mileston_raw = read.sas7bdat("mileston.sas7bdat")
# Extract the relevant variables
impact = select(impact_raw,MPRID,EARNY4,EVERJCH)
impact_temp = select(impact_raw,MPRID,JCVCLERC,JCVHLTH,JCVAUTO,JCVWELD,JCVELTRC,JCVCSTRC,JCVFOOD,JCVETRNC,JCVOTH)
baseline = select(baseline_raw,MPRID,RACE_ETH,HS_D,GED_D,VOC_D,ANY_ED1,HGC,NTV_LANG,WKEARNR,
HASCHLD,MARRIAGE,R_HEAD,HHMEMB,HEALTH,PY_CIG,PY_ALCHL,
PY_POT,HADWORRY,HEAR_JC,KNEWCNTR,E_MATH,E_READ,E_ALONG,
E_CONTRL,E_ESTEEM,E_SPCJOB,E_FRIEND,KNEW_JC,EARN_YR)
key_vars = select(key_vars_raw,MPRID,TREATMNT,FEMALE,AGE_CAT,EDUC_GR,NONRES)
mileston = select(mileston_raw,MPRID,LIVESPOU,EVERWORK,YR_WORK,CURRJOB,
JOB0_3, JOB3_9, JOB9_12, MOSTWELF,GOT_AFDC,GOT_FS,ED0_6,ED6_12,
PUBLICH,BADHLTH,HARDUSE,POTUSE,EVARRST,PMSA,MSA,PRARRI)
# Code the versions
versions = rep("Nothing",nrow(impact))
versions[impact_temp[,2] == 1] = "Clerical"
versions[impact_temp[,3] == 1] = "Health"
versions[impact_temp[,4] == 1] = "Auto"
versions[impact_temp[,5] == 1] = "Welding"
versions[impact_temp[,6] == 1 | impact_temp[,9] == 1] = "Electrical"
versions[impact_temp[,7] == 1] = "Construction"
versions[impact_temp[,8] == 1] = "Food"
versions[impact_temp[,10] == 1] = "Other"
versions[rowSums(impact_temp[,2:10],na.rm=T) > 1] = "Multiple"
impact = cbind(impact,versions)
# Merge the data
inner_join(impact,baseline,by="MPRID") %>% inner_join(key_vars,by="MPRID") %>% inner_join(mileston,by="MPRID") %>%
mutate(RACE_W = as.numeric(RACE_ETH == 1)) %>% mutate(RACE_B = as.numeric(RACE_ETH == 2)) %>%
mutate(RACE_H = as.numeric(RACE_ETH == 3)) %>% mutate(RACE_O = as.numeric(RACE_ETH == 4)) %>%
mutate(WKEARNR = replace(WKEARNR,is.na(WKEARNR),0)) %>%
mutate(EARN_YR = replace(EARN_YR,is.na(EARN_YR),0)) -> db
# Remove observations with missing values
db %>% na.omit() -> db
# Create main variables
Y = as.matrix(select(db,EARNY4))
D = as.matrix(select(db,TREATMNT))
# Heterogeneity variable
female = select(db,FEMALE)
fem_mat = cbind(1-female,female)
label_fem = c("Male","Female")
colnames(fem_mat) = label_fem
# Create versions
JC = db$EVERJCH
T = rep(NA,length(Y))
T[D == 0] = "Control"
T[D == 1 & JC == 0] = "No JC"
T[D == 1 & JC == 1 & db$versions == "Clerical"] = "Clerical"
T[D == 1 & JC == 1 & db$versions == "Health"] = "Health"
T[D == 1 & JC == 1 & db$versions == "Auto"] = "Auto"
T[D == 1 & JC == 1 & db$versions == "Welding"] = "Welding"
T[D == 1 & JC == 1 & db$versions == "Electrical"] = "Electrical"
T[D == 1 & JC == 1 & db$versions == "Construction"] = "Construction"
T[D == 1 & JC == 1 & db$versions == "Food"] = "Food"
T[D == 1 & JC == 1 & db$versions == "Other"] = "Other"
T[D == 1 & JC == 1 & db$versions == "Multiple"] = "Multiple"
T[is.na(T)] = "JC without voc"
label_T = c("Control","No JC","JC without voc","Clerical","Health","Auto","Welding","Electrical",
"Construction","Food","Other","Multiple")
T = factor(T,levels=label_T)
# Create control matrix
x_main = as.matrix(select(db,FEMALE,AGE_CAT,RACE_W,RACE_B,RACE_H,RACE_O,EDUC_GR,
LIVESPOU,EVERWORK,YR_WORK,CURRJOB,JOB0_3, JOB3_9, JOB9_12,
MOSTWELF,GOT_AFDC,GOT_FS,ED0_6,ED6_12,PUBLICH,BADHLTH,HARDUSE,POTUSE,EVARRST,PMSA,MSA,
HS_D,GED_D,VOC_D,ANY_ED1,HGC,NTV_LANG,WKEARNR,
HASCHLD,MARRIAGE,R_HEAD,HHMEMB,HEALTH,PY_CIG,PY_ALCHL,
PY_POT,HADWORRY,HEAR_JC,KNEWCNTR,E_MATH,E_READ,E_ALONG,
E_CONTRL,E_ESTEEM,E_SPCJOB,E_FRIEND,KNEW_JC,NONRES,PRARRI,EARN_YR))
X = design_matrix(x_main,int=colnames(x_main),int_d=2)
X = data_screen(X,print=F)
# Boolean to extract main effects
main_effects = !grepl(":",colnames(X))
The implementation is based on the causalDML package that estimates the average potential outcomes and average treatment effects for the multivalued effective treatment. The nuisance parameters are estimated using an ensemble of methods, which first needs to be initialized.
## Initialize components to be used in the ensemble learner
# General component
mean = create_method("mean",name="Mean")
forest = create_method("forest_grf",x_select = main_effects,name="Forest",
args=list(honesty = F,tune.parameters = "all",seed=seed))
# Pscore specific components
ridge_bin_low = create_method("ridge",x_select = main_effects,name="Ridge low",
args=list(family = "binomial",lambda.min.ratio=0.001))
lasso_bin_low = create_method("lasso",x_select = main_effects,name="Lasso low",
args=list(family = "binomial",lambda.min.ratio=0.01))
ridge_bin_high = create_method("ridge",name="Ridge high",
args=list(family = "binomial",lambda.min.ratio=0.01))
lasso_bin_high = create_method("lasso",name="Lasso high",
args=list(family = "binomial",lambda.min.ratio=0.05))
# Outcome specific components
ridge_ls_low = create_method("ridge",x_select = main_effects,name="Ridge low",
args=list(lambda.min.ratio=0.0001))
lasso_ls_low = create_method("lasso",x_select = main_effects,name="Lasso low",
args=list(lambda.min.ratio=0.001))
ridge_ls_high = create_method("ridge",name="Ridge high",
args=list(lambda.min.ratio=0.05))
lasso_ls_high = create_method("lasso",name="Lasso high",
args=list(lambda.min.ratio=0.05))
# Run multiple treatment model to get scores
ate = causalDML(Y,T,X,
ml_w=list(forest),
ml_y=list(forest),quiet=F)
# Uncomment this to replicate the results with ensemble of methods
# ate = causalDML(Y,T,X,
# ml_w=list(mean,forest,ridge_bin_low,lasso_bin_low,
# ridge_bin_high,lasso_bin_high),
# ml_y=list(mean,forest,ridge_ls_low,lasso_ls_low,
# ridge_ls_high,lasso_ls_high))
# Show results for each effective treatment
plot(ate$APO)
summary(ate$APO)
APO SE
Control 195.5294 2.935893
No JC 199.1164 5.213854
JC ithout voc 189.3148 6.003815
Clerical 232.9166 14.655847
Health 202.2124 12.753642
Auto 259.0107 22.461835
Welding 239.0742 17.897670
Electrical 241.7421 24.288266
Construction 236.5801 11.709246
Food 200.3551 11.762820
Other 216.3865 7.908796
Multiple 234.2120 8.767559
The decomposition reuses the nuisance parameters and doubly robust scores that are stored in the object created by the causalDML function.
Now we replicate the results of Figure 5 of the paper
## Decomposition for each subgroup
# Regression without constant
cat("Without constant:\n")
Without constant:
decomp_female_woc = HK_decomposition(ate$APO,fem_mat,intercept=FALSE)
summary(decomp_female_woc)
nATE:
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
zMale 18.2468 5.3624 3.4028 0.0006698 ***
zFemale 10.6330 5.2989 2.0066 0.0448161 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
rATE:
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
zMale 18.9297 5.9604 3.1759 0.001498 **
zFemale 16.0903 5.6883 2.8287 0.004684 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Delta:
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
zMale -0.68284 2.61117 -0.2615 0.79371
zFemale -5.45726 2.17331 -2.5110 0.01205 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Regression with constant
cat("\n\nWith constant:\n")
With constant:
decomp_female_wc = HK_decomposition(ate$APO,female,intercept=TRUE)
summary(decomp_female_wc)
nATE:
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 18.2468 5.3624 3.4028 0.0006698 ***
z -7.6138 7.5388 -1.0100 0.3125420
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
rATE:
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 18.9297 5.9604 3.1759 0.001498 **
z -2.8394 8.2391 -0.3446 0.730381
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Delta:
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.68284 2.61117 -0.2615 0.7937
z -4.77442 3.39728 -1.4054 0.1599
and the Figure itself:
# The following is probably the most complicated way to create the graph, but it works
# Extract coefficients
coef_fem = c(decomp_female_woc$nATE$results[1,1],
decomp_female_woc$Delta$results[1,1],
decomp_female_woc$rATE$results[1,1],
decomp_female_woc$nATE$results[2,1],
decomp_female_woc$Delta$results[2,1],
decomp_female_woc$rATE$results[2,1],
decomp_female_wc$nATE$results[2,1],
decomp_female_wc$Delta$results[2,1],
decomp_female_wc$rATE$results[2,1])
# Extract p-values
pv_fem = c(decomp_female_woc$nATE$results[1,4],
decomp_female_woc$Delta$results[1,4],
decomp_female_woc$rATE$results[1,4],
decomp_female_woc$nATE$results[2,4],
decomp_female_woc$Delta$results[2,4],
decomp_female_woc$rATE$results[2,4],
decomp_female_wc$nATE$results[2,4],
decomp_female_wc$Delta$results[2,4],
decomp_female_wc$rATE$results[2,4])
# Start and end points for waterfall graph
start0 = c(0,coef_fem[1],coef_fem[3])
end0 = c(coef_fem[1],coef_fem[3],0)
start1 = c(0,coef_fem[4],coef_fem[6])
end1 = c(coef_fem[4],coef_fem[6],0)
start2 = c(0,coef_fem[7],coef_fem[9])
end2 = c(coef_fem[7],coef_fem[9],0)
# Format results
coef_fem = format(coef_fem,digits=2)
pv_fem = paste0("(",format(pv_fem,digits=0,nsmall=3,scientific=F),")")
data.frame(id=c(1:3,1:3,1:3),
Estimand=factor(c("nATE","Delta","rATE","nATE","Delta","rATE","nATE","Delta","rATE"),levels = c("nATE","Delta","rATE")),
Gender = factor(c(rep("Male",3),rep("Female",3),rep("Difference Female - Male",3)),
levels = c("Female","Male","Difference Female - Male")),
start= c(start0,start1,start2),
end=c(end0,end1,end2),
coef = coef_fem,
pvalue = pv_fem) %>%
ggplot(aes(id, fill = Estimand,group=Gender)) +
geom_rect(aes(xmin = id - 0.5, xmax = id + 0.5, ymin = end,ymax = start)) +
scale_fill_brewer(palette="Dark2") + ylab("Effect") +
scale_x_continuous(breaks=c(1,2,3),labels=c("nATE", expression(Delta),"rATE")) +
theme_bw() + geom_hline(yintercept = 0) +
xlab("Estimand") + theme(legend.position="none") +
geom_text(aes(x = id, y = (start+end+1.25)/2, label = coef),size = 2.75) +
geom_text(aes(x = id, y = (start+end-1.25)/2, label = pvalue),size = 2.75) +
facet_wrap(~Gender)