
Theory in action
Michael C. Knaus
05/25
Source:vignettes/Theory_in_action_outcome_weights.Rmd
Theory_in_action_outcome_weights.Rmd
Setting the stage
Managing expectations
This notebook illustrates the theoretical derivations of Knaus (2024). We keep everything as simple as possible meaning that
we use the cleaned and readily available 401(k) data of the
hdm
package, which includes a treatment and an instrumentwe use the
regression_forest()
of thegrf
to estimate nuisance parameters because this is the only ML method with a build-in function to extract smoother weightswe hand-code two-fold cross-validation with
regression_forest()
and use the same nuisance parameters for different estimatorswe do no tune the hyperparameters and stick to the default setting
we only consider causal/instrumental forest effect estimates for one unit and not for all 9915
we stick as close as possible to the paper notation
However, the code provides the basis to extend the analysis in different directions. To this end, click on the “Code” button at the top right and “Download Rmd”.
This notebooks is only for illustration. The OutcomeWeights
R package implements everything in a more user friendly way.
Load packages and data
First, load the relevant packages
if (!require("cobalt")) install.packages("cobalt", dependencies = TRUE); library(cobalt)
if (!require("grf")) install.packages("grf", dependencies = TRUE); library(grf)
if (!require("hdm")) install.packages("hdm", dependencies = TRUE); library(hdm)
if (!require("gridExtra")) install.packages("gridExtra", dependencies = TRUE); library(gridExtra)
if (!require("tidyverse")) install.packages("tidyverse", dependencies = TRUE); library(tidyverse)
if (!require("AER")) install.packages("AER", dependencies = TRUE); library(AER)
and the data
data(pension) # Find variable description if you type ?pension in console
# Treatment
D = pension$p401
# Instrument
Z = pension$e401
# Outcome
Y = pension$net_tfa
# Controls
X = model.matrix(~ 0 + age + db + educ + fsize + hown + inc + male + marr + pira + twoearn, data = pension)
colnames(X) = c("Age","Benefit pension","Education","Family size","Home owner","Income","Male","Married","IRA","Two earners")
Define useful quantities and set the seed:
One function to calculate them all
Proposition 1 in the paper shows that the outcome weights vector can be obtained in the general form \boldsymbol{\omega'} = (\boldsymbol{\tilde{Z}'\tilde{D}})^{-1} \boldsymbol{\tilde{Z}'T}
where \boldsymbol{\tilde{Z}}, \boldsymbol{\tilde{D}} and \boldsymbol{T} are pseudo-instrument, pseudo-treatment and the transformation matrix, respectively.
This motivates the definition of the
weight_maker(Dtilde,Ztilde,T_mat)
function taking the three
generic inputs and producing the outcome weights vector:
weight_maker = function(Dtilde,Ztilde,T_mat) {
omega = (t(Ztilde) %*% Dtilde)^(-1) %*% t(Ztilde) %*% T_mat
return(omega)
}
In the following, we just need to define the three components for the respective estimators and plug them into this generic function to obtain the outcome weights.
Estimate all nuisance parameters and get outcome smoother matrices
Equation (8) in the paper shows which nuisance parameters are required for the estimators under consideration. Here, we estimate them all using honest random forest and 2-fold cross-fitting. Additionally, we extract the relevant smoother matrices \boldsymbol{S}, \boldsymbol{S^d_0}, \boldsymbol{S^d_1}, \boldsymbol{S^z_0}, and \boldsymbol{S^z_1}.
# Get fold number
fold = sample(1:nfolds,N,replace=T)
# Initialize nuisance parameters
Yhat = Yhatd0 = Yhatd1 = Yhatz0 = Yhatz1 =
Dhat = Dhatz0 = Dhatz1 = Zhat = rep(NA,N)
# Initialize smoother matrices
S = Sd0 = Sd1 = Sz0 = Sz1 = matrix(0,N,N)
for (i in 1:nfolds){
# Calculate nuisance parameters and relevant smoother matrices
rf_Yhat = regression_forest(X[fold != i,],Y[fold != i])
Yhat[fold == i] = predict(rf_Yhat, X[fold == i,])$predictions
S[fold == i, fold != i] = as.matrix(get_forest_weights(rf_Yhat, X[fold == i,]))
rf_Yhatd0 = regression_forest(X[fold != i & D==0,],Y[fold != i & D==0])
Yhatd0[fold == i] = predict(rf_Yhatd0, X[fold == i,])$predictions
Sd0[fold == i, fold != i & D==0] = as.matrix(get_forest_weights(rf_Yhatd0, X[fold == i,]))
rf_Yhatd1 = regression_forest(X[fold != i & D==1,],Y[fold != i & D==1])
Yhatd1[fold == i] = predict(rf_Yhatd1, X[fold == i,])$predictions
Sd1[fold == i, fold != i & D==1] = as.matrix(get_forest_weights(rf_Yhatd1, X[fold == i,]))
rf_Yhatz0 = regression_forest(X[fold != i & Z==0,],Y[fold != i & Z==0])
Yhatz0[fold == i] = predict(rf_Yhatz0, X[fold == i,])$predictions
Sz0[fold == i, fold != i & Z==0] = as.matrix(get_forest_weights(rf_Yhatz0, X[fold == i,]))
rf_Yhatz1 = regression_forest(X[fold != i & Z==1,],Y[fold != i & Z==1])
Yhatz1[fold == i] = predict(rf_Yhatz1, X[fold == i,])$predictions
Sz1[fold == i, fold != i & Z==1] = as.matrix(get_forest_weights(rf_Yhatz1, X[fold == i,]))
rf_Dhat = regression_forest(X[fold != i,],D[fold != i])
Dhat[fold == i] = predict(rf_Dhat, X[fold == i,])$predictions
rf_Dhatz0 = regression_forest(X[fold != i & Z==0,],D[fold != i & Z==0])
Dhatz0[fold == i] = predict(rf_Dhatz0, X[fold == i,])$predictions
rf_Dhatz1 = regression_forest(X[fold != i & Z==1,],D[fold != i & Z==1])
Dhatz1[fold == i] = predict(rf_Dhatz1, X[fold == i,])$predictions
rf_Zhat = regression_forest(X[fold != i,],Z[fold != i])
Zhat[fold == i] = predict(rf_Zhat, X[fold == i,])$predictions
}
For those not familiar with smoothers, let’s observe that they
produce exactly the same predictions as the original
predict()
functions:
cat("Smoother matrices replicate predicted nuisance parameter?",
all(
all.equal(as.numeric(S %*% Y), Yhat) == TRUE,
all.equal(as.numeric(Sd0 %*% Y), Yhatd0) == TRUE,
all.equal(as.numeric(Sd1 %*% Y), Yhatd1) == TRUE,
all.equal(as.numeric(Sz0 %*% Y), Yhatz0) == TRUE,
all.equal(as.numeric(Sz1 %*% Y), Yhatz1) == TRUE)
)
## Smoother matrices replicate predicted nuisance parameter? TRUE
Also observe that random forest is an affine smoother because all rows add up exactly to one:
cat("Affine smoother matrices?",
all(
all.equal(as.numeric(ones),as.numeric(rowSums(S))) == TRUE,
all.equal(as.numeric(ones),as.numeric(rowSums(Sd0))) == TRUE,
all.equal(as.numeric(ones),as.numeric(rowSums(Sd1))) == TRUE,
all.equal(as.numeric(ones),as.numeric(rowSums(Sz0))) == TRUE,
all.equal(as.numeric(ones),as.numeric(rowSums(Sz1))) == TRUE)
)
## Affine smoother matrices? TRUE
Finally, define the different IPW weights for later use:
lambda1 = D / Dhat
lambda0 = (1-D) / (1-Dhat)
lambdaz1 = Z / Zhat
lambdaz0 = (1-Z) / (1-Zhat)
Outcome weights of DML and GRF
Instrumental forest
To be replicated
Here, we want to replicate the CLATE estimate for unit 1. To this
end, we first run instrumental_forest()
with the
pre-defined nuisance parameters:
# Run IF with the pre-specified nuisance parameters
ivf = instrumental_forest(X,Y,D,Z,Y.hat=Yhat,W.hat=Dhat,Z.hat=Zhat)
# Get CLATEs
clates_if = predict(ivf)$predictions
The estimated CLATE for unit 1 is then 6421.6180875, which becomes the number to be replicated:
to_be_rep_if = clates_if[unit]
to_be_rep_if
## [1] 6421.618
Nearly replication
According to the main text of the paper, we need the following components \boldsymbol{\hat{R}} = \boldsymbol{Z} - \boldsymbol{\hat{Z}}, \quad \boldsymbol{\hat{V}} = \boldsymbol{D} - \boldsymbol{\hat{D}}, \quad \boldsymbol{\hat{U}} = \boldsymbol{Y} - \boldsymbol{\hat{Y}}, \quad \alpha^{if}(\boldsymbol{x}) that are defined in the following
Uhat = Y - Yhat
Vhat = D - Dhat
Rhat = Z - Zhat
alpha_if = get_forest_weights(ivf)[unit,]
Now define the pseudo-variables and the transformation matrix:
Ztilde_if = Rhat * alpha_if
Dtilde_if = Vhat
Ytilde_if = Uhat
T_if = diag(N) - S
As a sanity check observe how the transformation matrix replicates the pseudo-outcome:
cat("TY replicates pseudo-outcome?",
all.equal(as.numeric( T_if %*% Y ), Uhat))
## TY replicates pseudo-outcome? TRUE
Now we pass the required components to the
weight_maker()
function
omega_if = weight_maker(Dtilde_if, Ztilde_if, T_if)
repl_wo_const = omega_if %*% Y
This estimates an effect of 6405.4799565, which is very close but not identical to the number to be replicated:
all.equal(as.numeric( repl_wo_const ), to_be_rep_if)
## [1] "Mean relative difference: 0.002519426"
Exact replication
As noted in Appendix A.3.1, the grf package applies an additional constant which requires to adjust the pseudo-instrument:
# Create weighted residual maker matrix
M1_inv = solve(t(ones) %*% diag(alpha_if) %*% ones)
P1 = (ones %*% M1_inv) %*% (t(ones) %*% diag(alpha_if))
M1 = diag(N) - P1
# Get modified pseudo-instrument
Ztilde_if = (M1 %*% Rhat) * alpha_if
Plugging this new pseudo-instrument into the
weight_maker()
function creates outcome weights that now
perfectly replicate the package output:
# Get if outcome weights
omega_if = weight_maker(Dtilde_if, Ztilde_if, T_if)
cat("ω'Y replicates package output?",all.equal(as.numeric( omega_if %*% Y ), to_be_rep_if))
## ω'Y replicates package output? TRUE
Partially linear IV regression
To obtain the standard partially linear IV regression, we first solve the canonical moment condition to see which value should be replicated:
## [1] 12570.88
Compared to IF, we only need to pass a different pseudo-instrument to
the weight_maker
to replicate this number:
# Modified pseudo-instrument
Ztilde_plriv = Rhat * ones
# Get outcome weights
omega_plriv = weight_maker(Dtilde_if, Ztilde_plriv, T_if)
cat("ω'Y replicates moment based implentation?",all.equal(as.numeric( omega_plriv %*% Y ), to_be_rep_plriv))
## ω'Y replicates moment based implentation? TRUE
Causal forest
To be replicated
Here, we want to replicate the CATE estimate for unit 1. To this end,
we first run the causal_forest()
with the pre-defined
nuisance parameters:
cf = causal_forest(X,Y,D,Y.hat=Yhat,W.hat=Dhat)
cates_cf = predict(cf)$predictions
The estimated CATE for unit 1 is then 5588.963803, which becomes the number to be replicated:
to_be_rep_cf = cates_cf[unit]
Exact replication
Causal forest works similar to instrumental forest but requires weight \alpha^{cf}(\boldsymbol{x}) to enter the pseudo-instrument:
alpha_cf = get_forest_weights(cf)[unit,]
M1_inv = solve(t(ones) %*% diag(alpha_cf) %*% ones)
P1 = (ones %*% M1_inv) %*% (t(ones) %*% diag(alpha_cf))
M1 = diag(N) - P1
# Get pseudo-instrument
Ztilde_cf = (M1 %*% Vhat) * alpha_cf
Plugging this new pseudo-instrument into the
weight_maker()
function again perfectly replicates the
package output:
# Get cf outcome weights
omega_cf = weight_maker(Dtilde_if, Ztilde_cf, T_if)
cat("ω'Y replicates package output?",
all.equal(as.numeric( omega_cf %*% Y ), to_be_rep_cf))
## ω'Y replicates package output? TRUE
Partially linear regression
To obtain the standard partially linear regression, we first solve the canonical moment condition to see which value should be replicated:
## [1] 13576.51
Compared to CF, we only need to pass a different pseudo-instrument to
the weight_maker()
function to replicate this number:
# Modified pseudo-instrument
Ztilde_plr = Vhat * ones
# Get outcome weights
omega_plr = weight_maker(Dtilde_if, Ztilde_plr, T_if)
cat("ω'Y replicates moment based implementation?",
all.equal(as.numeric( omega_plr %*% Y ), to_be_rep_plr))
## ω'Y replicates moment based implementation? TRUE
Augmented inverse probability weighting
The AIPW uses a large pseudo-outcome and takes the mean of it:
Ytilde_aipw = Yhatd1 - Yhatd0 + lambda1 * (Y - Yhatd1) - lambda0 * (Y- Yhatd0)
to_be_rep_aipw = mean(Ytilde_aipw)
to_be_rep_aipw
## [1] 11521.92
To replicate this number with outcome weights, we need to implement the transformation matrix
# Taipw = Sd1 - Sd0 + diag(lambda1) %*% (diag(N) - Sd1) - diag(lambda0) %*% (diag(N) - Sd0) # slow
T_aipw = Sd1 - Sd0 + lambda1 * (diag(N) - Sd1) - lambda0 * (diag(N) - Sd0)
and plug it into the weight maker to replicate the number:
omega_aipw = weight_maker(ones, ones, T_aipw)
cat("ω'Y replicates standard implementation?",
all.equal(as.numeric( omega_aipw %*% Y ), to_be_rep_aipw))
## ω'Y replicates standard implementation? TRUE
Wald AIPW
The moment based representation is the ratio of to average treatment effects estimated with AIPW:
Dtilde_ivaipw = Dhatz1 - Dhatz0 + Z / Zhat * (D - Dhatz1) - (1 - Z) / (1 - Zhat) * (D - Dhatz0)
Ytilde_ivaipw = Yhatz1 - Yhatz0 + Z / Zhat * (Y - Yhatz1) - (1 - Z) / (1 - Zhat) * (Y - Yhatz0)
ITT_y = mean(Ytilde_ivaipw)
ITT_d = mean(Dtilde_ivaipw)
to_be_rep_waipw = ITT_y / ITT_d
to_be_rep_waipw
## [1] 11308.88
Again the transformation matrix is the involved part but we also have a non-one pseudo-treatment:
# T_waipw = Sz1 - Sz0 + diag(lambdaz1) %*% (diag(n) - Sz1) - diag(lambdaz0) %*% (diag(n) - Sz0)
T_waipw = Sz1 - Sz0 + lambdaz1 * (diag(N) - Sz1) - lambdaz0 * (diag(N) - Sz0)
Dtilde_waipw = ones * ITT_d
omega_waipw = weight_maker(Dtilde_waipw, ones, T_waipw)
cat("ω'Y replicates standard implementation?",
all.equal(as.numeric( omega_waipw %*% Y ), to_be_rep_waipw))
## ω'Y replicates standard implementation? TRUE
Outcome weights of special cases
TSLS
We can similarly extract the outcome weights of TSLS to replicate the
standard implementation of the AER
package:
##
## Call:
## ivreg(formula = Y ~ D + X | X + Z)
##
## Residuals:
## Min 1Q Median 3Q Max
## -507731 -16906 -2147 10266 1431531
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.239e+04 4.411e+03 -7.344 2.24e-13 ***
## D 8.483e+03 1.799e+03 4.716 2.43e-06 ***
## XAge 6.248e+02 5.993e+01 10.425 < 2e-16 ***
## XBenefit pension -4.514e+03 1.344e+03 -3.360 0.000783 ***
## XEducation -6.253e+02 2.282e+02 -2.740 0.006152 **
## XFamily size -1.067e+03 4.552e+02 -2.343 0.019150 *
## XHome owner 1.063e+03 1.323e+03 0.804 0.421557
## XIncome 9.283e-01 3.063e-02 30.309 < 2e-16 ***
## XMale -1.028e+03 1.513e+03 -0.679 0.496987
## XMarried 6.988e+02 1.814e+03 0.385 0.700108
## XIRA 2.912e+04 1.467e+03 19.853 < 2e-16 ***
## XTwo earners -1.933e+04 1.575e+03 -12.274 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55600 on 9903 degrees of freedom
## Multiple R-Squared: 0.2348, Adjusted R-squared: 0.234
## Wald test: 272 on 11 and 9903 DF, p-value: < 2.2e-16
to_be_rep_tsls = tsls$coefficient[2]
We now use residuals produced by the residual maker matrix as pseudo-instrument and -treatment, and the residual maker matrix as transformation matrix:
# Get residuals
Ztilde_tsls = lm(Z ~ X)$residuals
Dtilde_tsls = lm(D ~ X)$residuals
# Get projection matrix
Xcons = model.matrix(~ X)
PX = Xcons %*% solve(crossprod(Xcons)) %*% t(Xcons)
# Get residual maker matrix
MX = diag(N) - PX
# Create TSLS
omega_tsls = weight_maker(Dtilde_tsls, Ztilde_tsls, MX)
cat("ω'Y replicates standard implementation?",
all.equal(as.numeric( omega_tsls %*% Y ), as.numeric(to_be_rep_tsls)))
## ω'Y replicates standard implementation? TRUE
Wald estimator
The Wald estimator is replicated using the residual maker matrix of a constant:
# Get residuals
Ztilde_wald = lm(Z ~ 1)$residuals
Dtilde_wald = lm(D ~ 1)$residuals
# Get standard implementation
wald = ivreg(Y ~ D | Z)
to_be_rep_wald = wald$coefficient[2]
# Get projection and residual maker matrix of ones
P1 = ones %*% solve(crossprod(ones)) %*% t(ones)
M1 = diag(N) - P1
# Create TSLS
omega_wald = weight_maker(Dtilde_wald, Ztilde_wald, M1)
cat("ω'Y replicates standard implementation?",
all.equal(as.numeric( omega_wald %*% Y ), as.numeric(to_be_rep_wald)))
## ω'Y replicates standard implementation? TRUE
OLS
To replicate OLS, we can just recycle the linear treatment residuals from TSLS also as pseudo-instrument:
ols = lm(Y ~ D + X)
to_be_rep_ols = ols$coefficient[2]
# Create OLS weights
omega_ols = weight_maker(Dtilde_tsls, Dtilde_tsls, MX)
cat("ω'Y replicates standard implementation?",
all.equal(as.numeric( omega_ols %*% Y ), as.numeric(to_be_rep_ols)))
## ω'Y replicates standard implementation? TRUE
Difference in means
Finally, the difference in means estimator is recovered by recycling the Wald residuals:
dim = lm(Y ~ D)
to_be_rep_dim = dim$coefficient[2]
# Create DiM weights
omega_dim = weight_maker(Dtilde_wald, Dtilde_wald, M1)
cat("ω'Y replicates standard implementation?",
all.equal(as.numeric( omega_dim %*% Y ), as.numeric(to_be_rep_dim)))
## ω'Y replicates standard implementation? TRUE