Theory in action

Setting the stage

Managing expectations

This notebook illustrates the theoretical derivations of Knaus (2024). We keep everything as simple as possible meaning that

we use the cleaned and readily available 401(k) data of the hdm package, which includes a treatment and an instrument
we use the regression_forest() of the grf to estimate nuisance parameters because this is the only ML method with a build-in function to extract smoother weights
we hand-code two-fold cross-validation with regression_forest() and use the same nuisance parameters for different estimators
we do no tune the hyperparameters and stick to the default setting
we only consider causal/instrumental forest effect estimates for one unit and not for all 9915
we stick as close as possible to the paper notation

However, the code provides the basis to extend the analysis in different directions. To this end, click on the “Code” button at the top right and “Download Rmd”.

This notebooks is only for illustration. The OutcomeWeights R package implements everything in a more user friendly way.

Load packages and data

First, load the relevant packages

if (!require("cobalt")) install.packages("cobalt", dependencies = TRUE); library(cobalt)
if (!require("grf")) install.packages("grf", dependencies = TRUE); library(grf)
if (!require("hdm")) install.packages("hdm", dependencies = TRUE); library(hdm)
if (!require("gridExtra")) install.packages("gridExtra", dependencies = TRUE); library(gridExtra)
if (!require("tidyverse")) install.packages("tidyverse", dependencies = TRUE); library(tidyverse)
if (!require("AER")) install.packages("AER", dependencies = TRUE); library(AER)

and the data

data(pension) # Find variable description if you type ?pension in console
# Treatment
D = pension$p401
# Instrument
Z = pension$e401
# Outcome
Y = pension$net_tfa
# Controls
X = model.matrix(~ 0 + age + db + educ + fsize + hown + inc + male + marr + pira + twoearn, data = pension)
colnames(X) = c("Age","Benefit pension","Education","Family size","Home owner","Income","Male","Married","IRA","Two earners")

Define useful quantities and set the seed:

# Helpful variables
N = length(Y)
ones = matrix(1,N,1)

# Choose unit for which causal/instrumental forest weights should be calculated
unit = 1

# Define number of folds for cross-fitting
nfolds = 2

set.seed(1234)

One function to calculate them all

Proposition 1 in the paper shows that the outcome weights vector can be obtained in the general form \boldsymbol{\omega'} = (\boldsymbol{\tilde{Z}'\tilde{D}})^{-1} \boldsymbol{\tilde{Z}'T}

where \boldsymbol{\tilde{Z}}, \boldsymbol{\tilde{D}} and \boldsymbol{T} are pseudo-instrument, pseudo-treatment and the transformation matrix, respectively.

This motivates the definition of the weight_maker(Dtilde,Ztilde,T_mat) function taking the three generic inputs and producing the outcome weights vector:

weight_maker = function(Dtilde,Ztilde,T_mat) {
  omega = (t(Ztilde) %*% Dtilde)^(-1) %*% t(Ztilde) %*% T_mat
  return(omega)
}

In the following, we just need to define the three components for the respective estimators and plug them into this generic function to obtain the outcome weights.

Estimate all nuisance parameters and get outcome smoother matrices

Equation (8) in the paper shows which nuisance parameters are required for the estimators under consideration. Here, we estimate them all using honest random forest and 2-fold cross-fitting. Additionally, we extract the relevant smoother matrices \boldsymbol{S}, \boldsymbol{S^d_0}, \boldsymbol{S^d_1}, \boldsymbol{S^z_0}, and \boldsymbol{S^z_1}.

# Get fold number
fold = sample(1:nfolds,N,replace=T)

# Initialize nuisance parameters
Yhat = Yhatd0 = Yhatd1 = Yhatz0 = Yhatz1 = 
  Dhat = Dhatz0 = Dhatz1 = Zhat = rep(NA,N)

# Initialize smoother matrices
S = Sd0 = Sd1 = Sz0 = Sz1 = matrix(0,N,N)

for (i in 1:nfolds){
  # Calculate nuisance parameters and relevant smoother matrices
  rf_Yhat = regression_forest(X[fold != i,],Y[fold != i])
  Yhat[fold == i] = predict(rf_Yhat, X[fold == i,])$predictions
  S[fold == i, fold != i] = as.matrix(get_forest_weights(rf_Yhat, X[fold == i,]))

  rf_Yhatd0 = regression_forest(X[fold != i & D==0,],Y[fold != i & D==0])
  Yhatd0[fold == i] = predict(rf_Yhatd0, X[fold == i,])$predictions
  Sd0[fold == i, fold != i  & D==0] = as.matrix(get_forest_weights(rf_Yhatd0, X[fold == i,]))

  rf_Yhatd1 = regression_forest(X[fold != i & D==1,],Y[fold != i & D==1])
  Yhatd1[fold == i] = predict(rf_Yhatd1, X[fold == i,])$predictions
  Sd1[fold == i, fold != i  & D==1] = as.matrix(get_forest_weights(rf_Yhatd1, X[fold == i,]))

  rf_Yhatz0 = regression_forest(X[fold != i & Z==0,],Y[fold != i & Z==0])
  Yhatz0[fold == i] = predict(rf_Yhatz0, X[fold == i,])$predictions
  Sz0[fold == i, fold != i  & Z==0] = as.matrix(get_forest_weights(rf_Yhatz0, X[fold == i,]))
  
  rf_Yhatz1 = regression_forest(X[fold != i & Z==1,],Y[fold != i & Z==1])
  Yhatz1[fold == i] = predict(rf_Yhatz1, X[fold == i,])$predictions
  Sz1[fold == i, fold != i  & Z==1] = as.matrix(get_forest_weights(rf_Yhatz1, X[fold == i,]))
  
  rf_Dhat = regression_forest(X[fold != i,],D[fold != i])
  Dhat[fold == i] = predict(rf_Dhat, X[fold == i,])$predictions
  
  rf_Dhatz0 = regression_forest(X[fold != i & Z==0,],D[fold != i & Z==0])
  Dhatz0[fold == i] = predict(rf_Dhatz0, X[fold == i,])$predictions
  
  rf_Dhatz1 = regression_forest(X[fold != i & Z==1,],D[fold != i & Z==1])
  Dhatz1[fold == i] = predict(rf_Dhatz1, X[fold == i,])$predictions
  
  rf_Zhat = regression_forest(X[fold != i,],Z[fold != i])
  Zhat[fold == i] = predict(rf_Zhat, X[fold == i,])$predictions
}

For those not familiar with smoothers, let’s observe that they produce exactly the same predictions as the original predict() functions:

cat("Smoother matrices replicate predicted nuisance parameter?",
 all(
  all.equal(as.numeric(S %*% Y), Yhat) == TRUE,
  all.equal(as.numeric(Sd0 %*% Y), Yhatd0) == TRUE,
  all.equal(as.numeric(Sd1 %*% Y), Yhatd1) == TRUE,
  all.equal(as.numeric(Sz0 %*% Y), Yhatz0) == TRUE,
  all.equal(as.numeric(Sz1 %*% Y), Yhatz1) == TRUE)
)

## Smoother matrices replicate predicted nuisance parameter? TRUE

Also observe that random forest is an affine smoother because all rows add up exactly to one:

cat("Affine smoother matrices?",
all(
  all.equal(as.numeric(ones),as.numeric(rowSums(S))) == TRUE,
  all.equal(as.numeric(ones),as.numeric(rowSums(Sd0))) == TRUE,
  all.equal(as.numeric(ones),as.numeric(rowSums(Sd1))) == TRUE,
  all.equal(as.numeric(ones),as.numeric(rowSums(Sz0))) == TRUE,
  all.equal(as.numeric(ones),as.numeric(rowSums(Sz1))) == TRUE)
)

## Affine smoother matrices? TRUE

Finally, define the different IPW weights for later use:

lambda1 = D / Dhat
lambda0 = (1-D) / (1-Dhat)
lambdaz1 = Z / Zhat
lambdaz0 = (1-Z) / (1-Zhat)

Outcome weights of DML and GRF

Instrumental forest

To be replicated

Here, we want to replicate the CLATE estimate for unit 1. To this end, we first run instrumental_forest() with the pre-defined nuisance parameters:

# Run IF with the pre-specified nuisance parameters
ivf = instrumental_forest(X,Y,D,Z,Y.hat=Yhat,W.hat=Dhat,Z.hat=Zhat)
# Get CLATEs 
clates_if = predict(ivf)$predictions

The estimated CLATE for unit 1 is then 6421.6180875, which becomes the number to be replicated:

to_be_rep_if = clates_if[unit]
to_be_rep_if

## [1] 6421.618

Nearly replication

According to the main text of the paper, we need the following components \boldsymbol{\hat{R}} = \boldsymbol{Z} - \boldsymbol{\hat{Z}}, \quad \boldsymbol{\hat{V}} = \boldsymbol{D} - \boldsymbol{\hat{D}}, \quad \boldsymbol{\hat{U}} = \boldsymbol{Y} - \boldsymbol{\hat{Y}}, \quad \alpha^{if}(\boldsymbol{x}) that are defined in the following

Uhat = Y - Yhat
Vhat = D - Dhat
Rhat = Z - Zhat
alpha_if = get_forest_weights(ivf)[unit,]

Now define the pseudo-variables and the transformation matrix:

Ztilde_if = Rhat * alpha_if
Dtilde_if = Vhat
Ytilde_if = Uhat
T_if = diag(N) - S

As a sanity check observe how the transformation matrix replicates the pseudo-outcome:

cat("TY replicates pseudo-outcome?",
  all.equal(as.numeric( T_if %*% Y ), Uhat))

## TY replicates pseudo-outcome? TRUE

Now we pass the required components to the weight_maker() function

omega_if = weight_maker(Dtilde_if, Ztilde_if, T_if)
repl_wo_const = omega_if %*% Y

This estimates an effect of 6405.4799565, which is very close but not identical to the number to be replicated:

all.equal(as.numeric( repl_wo_const ), to_be_rep_if)

## [1] "Mean relative difference: 0.002519426"

Exact replication

As noted in Appendix A.3.1, the grf package applies an additional constant which requires to adjust the pseudo-instrument:

# Create weighted residual maker matrix
M1_inv = solve(t(ones) %*% diag(alpha_if) %*% ones)
P1 = (ones %*% M1_inv) %*% (t(ones) %*% diag(alpha_if))
M1 = diag(N) - P1
# Get modified pseudo-instrument
Ztilde_if = (M1 %*% Rhat) * alpha_if

Plugging this new pseudo-instrument into the weight_maker() function creates outcome weights that now perfectly replicate the package output:

# Get if outcome weights
omega_if = weight_maker(Dtilde_if, Ztilde_if, T_if) 
cat("ω'Y replicates package output?",all.equal(as.numeric( omega_if %*% Y ), to_be_rep_if))

## ω'Y replicates package output? TRUE

Partially linear IV regression

To obtain the standard partially linear IV regression, we first solve the canonical moment condition to see which value should be replicated:

to_be_rep_plriv = mean(Rhat * Uhat) / mean(Rhat * Vhat)
to_be_rep_plriv

## [1] 12570.88

Compared to IF, we only need to pass a different pseudo-instrument to the weight_maker to replicate this number:

# Modified pseudo-instrument
Ztilde_plriv = Rhat * ones
# Get outcome weights
omega_plriv = weight_maker(Dtilde_if, Ztilde_plriv, T_if) 
cat("ω'Y replicates moment based implentation?",all.equal(as.numeric( omega_plriv %*% Y ), to_be_rep_plriv))

## ω'Y replicates moment based implentation? TRUE

Causal forest

To be replicated

Here, we want to replicate the CATE estimate for unit 1. To this end, we first run the causal_forest() with the pre-defined nuisance parameters:

cf = causal_forest(X,Y,D,Y.hat=Yhat,W.hat=Dhat)
cates_cf = predict(cf)$predictions

The estimated CATE for unit 1 is then 5588.963803, which becomes the number to be replicated:

to_be_rep_cf = cates_cf[unit]

Exact replication

Causal forest works similar to instrumental forest but requires weight \alpha^{cf}(\boldsymbol{x}) to enter the pseudo-instrument:

alpha_cf = get_forest_weights(cf)[unit,]
M1_inv = solve(t(ones) %*% diag(alpha_cf) %*% ones)
P1 = (ones %*% M1_inv) %*% (t(ones) %*% diag(alpha_cf))
M1 = diag(N) - P1
# Get pseudo-instrument
Ztilde_cf = (M1 %*% Vhat) * alpha_cf

Plugging this new pseudo-instrument into the weight_maker() function again perfectly replicates the package output:

# Get cf outcome weights
omega_cf = weight_maker(Dtilde_if, Ztilde_cf, T_if) 
cat("ω'Y replicates package output?",
    all.equal(as.numeric( omega_cf %*% Y ), to_be_rep_cf))

## ω'Y replicates package output? TRUE

Partially linear regression

To obtain the standard partially linear regression, we first solve the canonical moment condition to see which value should be replicated:

to_be_rep_plr = mean(Vhat * Uhat) / mean(Vhat * Vhat)
to_be_rep_plr

## [1] 13576.51

Compared to CF, we only need to pass a different pseudo-instrument to the weight_maker() function to replicate this number:

# Modified pseudo-instrument
Ztilde_plr = Vhat * ones
# Get outcome weights
omega_plr = weight_maker(Dtilde_if, Ztilde_plr, T_if) 
cat("ω'Y replicates moment based implementation?",
    all.equal(as.numeric( omega_plr %*% Y ), to_be_rep_plr))

## ω'Y replicates moment based implementation? TRUE

Augmented inverse probability weighting

The AIPW uses a large pseudo-outcome and takes the mean of it:

Ytilde_aipw = Yhatd1 - Yhatd0 + lambda1 * (Y - Yhatd1) - lambda0 * (Y- Yhatd0)
to_be_rep_aipw = mean(Ytilde_aipw)
to_be_rep_aipw

## [1] 11521.92

To replicate this number with outcome weights, we need to implement the transformation matrix

# Taipw = Sd1 - Sd0 + diag(lambda1) %*% (diag(N) - Sd1) - diag(lambda0) %*% (diag(N) - Sd0) # slow
T_aipw = Sd1 - Sd0 + lambda1 * (diag(N) - Sd1) - lambda0 * (diag(N) - Sd0)

and plug it into the weight maker to replicate the number:

omega_aipw = weight_maker(ones, ones, T_aipw) 
cat("ω'Y replicates standard implementation?",
    all.equal(as.numeric( omega_aipw %*% Y ), to_be_rep_aipw))

## ω'Y replicates standard implementation? TRUE

Wald AIPW

The moment based representation is the ratio of to average treatment effects estimated with AIPW:

Dtilde_ivaipw = Dhatz1 - Dhatz0 + Z / Zhat * (D - Dhatz1) - (1 - Z) / (1 - Zhat) * (D - Dhatz0)

Ytilde_ivaipw = Yhatz1 - Yhatz0 + Z / Zhat * (Y - Yhatz1) - (1 - Z) / (1 - Zhat) * (Y - Yhatz0)

ITT_y = mean(Ytilde_ivaipw)
ITT_d = mean(Dtilde_ivaipw)

to_be_rep_waipw = ITT_y / ITT_d
to_be_rep_waipw

## [1] 11308.88

Again the transformation matrix is the involved part but we also have a non-one pseudo-treatment:

# T_waipw = Sz1 - Sz0 + diag(lambdaz1) %*% (diag(n) - Sz1) - diag(lambdaz0) %*% (diag(n) - Sz0)
T_waipw = Sz1 - Sz0 + lambdaz1 * (diag(N) - Sz1) - lambdaz0 * (diag(N) - Sz0)

Dtilde_waipw = ones * ITT_d

omega_waipw = weight_maker(Dtilde_waipw, ones, T_waipw)
cat("ω'Y replicates standard implementation?",
    all.equal(as.numeric( omega_waipw %*% Y ), to_be_rep_waipw))

## ω'Y replicates standard implementation? TRUE

Outcome weights of special cases

TSLS

We can similarly extract the outcome weights of TSLS to replicate the standard implementation of the AER package:

tsls = ivreg(Y ~ D + X  | X + Z)
summary(tsls)

## 
## Call:
## ivreg(formula = Y ~ D + X | X + Z)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -507731  -16906   -2147   10266 1431531 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -3.239e+04  4.411e+03  -7.344 2.24e-13 ***
## D                 8.483e+03  1.799e+03   4.716 2.43e-06 ***
## XAge              6.248e+02  5.993e+01  10.425  < 2e-16 ***
## XBenefit pension -4.514e+03  1.344e+03  -3.360 0.000783 ***
## XEducation       -6.253e+02  2.282e+02  -2.740 0.006152 ** 
## XFamily size     -1.067e+03  4.552e+02  -2.343 0.019150 *  
## XHome owner       1.063e+03  1.323e+03   0.804 0.421557    
## XIncome           9.283e-01  3.063e-02  30.309  < 2e-16 ***
## XMale            -1.028e+03  1.513e+03  -0.679 0.496987    
## XMarried          6.988e+02  1.814e+03   0.385 0.700108    
## XIRA              2.912e+04  1.467e+03  19.853  < 2e-16 ***
## XTwo earners     -1.933e+04  1.575e+03 -12.274  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55600 on 9903 degrees of freedom
## Multiple R-Squared: 0.2348,  Adjusted R-squared: 0.234 
## Wald test:   272 on 11 and 9903 DF,  p-value: < 2.2e-16

to_be_rep_tsls = tsls$coefficient[2]

We now use residuals produced by the residual maker matrix as pseudo-instrument and -treatment, and the residual maker matrix as transformation matrix:

# Get residuals
Ztilde_tsls = lm(Z ~ X)$residuals
Dtilde_tsls = lm(D ~ X)$residuals

# Get projection matrix
Xcons = model.matrix(~ X)
PX = Xcons %*% solve(crossprod(Xcons)) %*% t(Xcons)
# Get residual maker matrix
MX = diag(N) - PX

# Create TSLS
omega_tsls = weight_maker(Dtilde_tsls, Ztilde_tsls, MX) 

cat("ω'Y replicates standard implementation?",
    all.equal(as.numeric( omega_tsls %*% Y ), as.numeric(to_be_rep_tsls)))

## ω'Y replicates standard implementation? TRUE

Wald estimator

The Wald estimator is replicated using the residual maker matrix of a constant:

# Get residuals
Ztilde_wald = lm(Z ~ 1)$residuals
Dtilde_wald = lm(D ~ 1)$residuals

# Get standard implementation
wald = ivreg(Y ~ D | Z)
to_be_rep_wald = wald$coefficient[2]

# Get projection and residual maker matrix of ones
P1 = ones %*% solve(crossprod(ones)) %*% t(ones)
M1 = diag(N) - P1

# Create TSLS
omega_wald = weight_maker(Dtilde_wald, Ztilde_wald, M1) 

cat("ω'Y replicates standard implementation?",
    all.equal(as.numeric( omega_wald %*% Y ), as.numeric(to_be_rep_wald)))

## ω'Y replicates standard implementation? TRUE

OLS

To replicate OLS, we can just recycle the linear treatment residuals from TSLS also as pseudo-instrument:

ols = lm(Y ~ D + X)
to_be_rep_ols = ols$coefficient[2]

# Create OLS weights
omega_ols = weight_maker(Dtilde_tsls, Dtilde_tsls, MX) 

cat("ω'Y replicates standard implementation?",
    all.equal(as.numeric( omega_ols %*% Y ), as.numeric(to_be_rep_ols)))

## ω'Y replicates standard implementation? TRUE

Difference in means

Finally, the difference in means estimator is recovered by recycling the Wald residuals:

dim = lm(Y ~ D)
to_be_rep_dim = dim$coefficient[2]

# Create DiM weights
omega_dim = weight_maker(Dtilde_wald, Dtilde_wald, M1) 

cat("ω'Y replicates standard implementation?",
    all.equal(as.numeric( omega_dim %*% Y ), as.numeric(to_be_rep_dim)))

## ω'Y replicates standard implementation? TRUE

Michael C. Knaus

05/25

Setting the stage

Managing expectations

Load packages and data

One function to calculate them all

Estimate all nuisance parameters and get outcome smoother matrices

Outcome weights of DML and GRF

Instrumental forest

To be replicated

Nearly replication

Exact replication

Partially linear IV regression

Causal forest

To be replicated

Exact replication

Partially linear regression

Augmented inverse probability weighting

Wald AIPW

Outcome weights of special cases

TSLS

Wald estimator

OLS

Difference in means