Draw something using two potential outcome functions and check how well causal forests can approximate the CATE function resulting from your drawing. You only need to modify the bold labels and the first code chunk. Send me your solutions via mail by 22.12. and/or post your results under my Bluesky post.
I have drawn a (stylized) Christmas tree using two potential outcome functions.
This defines the functions and plots them. Your task is to modify this part:
if (!require("tidyverse")) install.packages("tidyverse", dependencies = TRUE); library(tidyverse)
if (!require("grf")) install.packages("grf", dependencies = TRUE); library(grf)
if (!require("patchwork")) install.packages("patchwork", dependencies = TRUE); library(patchwork)
# Define the functions
# Width (in x) of each decoration drawn on the tree
# (presumably "cw" = candle width -- confirm with the author)
cw = 0.05
# Extra height added on top of the tree outline inside each decoration
# (presumably "ch" = candle height -- confirm with the author)
ch = 0.6
# Potential outcome under treatment, Y(1): the tree outline with decorations.
#
# x: numeric vector of covariate values (the plots use the range [-1, 1]).
# Returns a numeric vector of the same length as x.
#
# The outline is the tent 3 - 3 * |x| (peak 3 at x = 0, zero at x = -1 and 1).
# On each half-open interval [s, s + cw) the value is replaced by the flat
# level "outline at s, plus ch". Reads the globals cw and ch at call time.
m1 = function(x) {
  # Left edge of every decoration; the six positions are symmetric around 0
  decoration_starts = c(-0.8, -0.5, -0.2, 0.2, 0.5, 0.8)
  # Tree outline: 3 * x + 3 for x < 0 and -3 * x + 3 for x >= 0
  y = 3 - 3 * abs(x)
  for (s in decoration_starts) {
    # Half-open interval, so x == s + cw already belongs to the outline again
    # (previously that exact point matched no segment and evaluated to 0)
    inside = (x >= s) & (x < (s + cw))
    # which() drops NA entries, so NA inputs simply stay NA in the result
    y[which(inside)] = 3 - 3 * abs(s) + ch
  }
  y
}
# Potential outcome under control, Y(0): the trunk of the tree.
#
# x: numeric vector of covariate values.
# Returns -0.8 where -0.07 < x < 0.07 (both bounds exclusive) and 0 elsewhere,
# as a numeric vector of the same length as x.
#
# The original body had a leading "0 * (...)" term on its own line. Because of
# the line break R treated it as a separate statement and discarded it, so it
# never contributed to the result; it has been removed.
m0 = function(x) {
  -0.8 * ((x > -0.07) & (x < 0.07))
}
# Draw both potential outcome curves (m1 and m0) over x in [-1, 1];
# the two-row data frame only supplies the x range for stat_function()
g2 = ggplot(data.frame(x = c(-1, 1)), aes(x)) +
  stat_function(fun = m1, size = 1, colour = "forestgreen") +
  stat_function(fun = m0, size = 1, colour = "forestgreen") +
  labs(x = "X1", y = "Y(w)") +
  theme_minimal()
g2
Now plot the resulting CATE:
# CATE implied by the drawing: difference of the two potential outcome functions
tau = function(x) {
  m1(x) - m0(x)
}
# Plot the CATE over x in [-1, 1]
g3 = ggplot(data.frame(x = c(-1, 1)), aes(x)) +
  stat_function(fun = tau, size = 1) +
  labs(x = "X1", y = "CATE") +
  theme_minimal()
g3
Now we investigate how well causal forests approximate the resulting CATE function for different sample sizes.
To this end we define a little function that takes the two potential outcome functions as inputs, draws a random treatment \(W \sim \text{Bernoulli}(1/2)\) and adds standard normal noise to the outcome. We then run it for sample sizes 100/1000/10000/100000 and observe how the visual fit and the RMSE improve (this runs for about 15 minutes on my laptop, but feel free to change the sample sizes to reduce computation time).
# Simulate one data set from two potential outcome functions, fit a causal
# forest and compare its CATE predictions with the true CATE.
#
# m1, m0: vectorised functions of the first covariate giving the potential
#         outcomes under treatment and under control.
# n:      number of observations to simulate.
# p:      number of covariates; only the first one enters the outcome.
# ...:    forwarded unchanged to grf::causal_forest().
#
# Returns a list with
#   g:    ggplot of the predicted CATEs (blue squares) against the first
#         covariate, with the true CATE function drawn as a line,
#   RMSE: root mean squared error of the predictions against the true CATE.
#
# Draws from R's random number generator, so call set.seed() beforehand for
# reproducible results.
cf_estimation = function(m1, m0, n, p = 2, ...) {
  # Get CATE function
  tau = function(x) {
    m1(x) - m0(x)
  }
  # Draw sample: covariates uniform on [-1, 1], treatment Bernoulli(1/2),
  # outcome = relevant potential outcome plus standard normal noise
  X = matrix(runif(n * p, -1, 1), ncol = p)
  W = rbinom(n, 1, 1/2)
  x1 = X[, 1]
  Y = W * m1(x1) + (1 - W) * m0(x1) + rnorm(n, 0, 1)
  # Run CF; predict() without new data returns predictions for the training
  # sample (out-of-bag according to the grf documentation)
  cf = causal_forest(X, Y, W, ...)
  cates = predict(cf)$predictions
  # Plot. format(scientific = FALSE) keeps the title as e.g. "n=100000";
  # toString(100000) would print "1e+05".
  panel_title = paste0("n=", format(n, scientific = FALSE, trim = TRUE))
  g = data.frame(x = x1, y = cates) %>%
    ggplot() +
    geom_point(aes(x = x, y = y), shape = "square", color = "blue") +
    stat_function(fun = tau, size = 1) +
    ylab("CATE") +
    ggtitle(panel_title)
  # RMSE
  rmse = sqrt(mean((cates - tau(x1))^2))
  # Return results
  list("g" = g, "RMSE" = rmse)
}
# Fix the RNG state so the simulated samples below are reproducible
set.seed(1234)

# One tuned causal forest per sample size, run from smallest to largest
n100 = cf_estimation(m1, m0, n = 100, tune.parameters = "all")
n1000 = cf_estimation(m1, m0, n = 1000, tune.parameters = "all")
n10000 = cf_estimation(m1, m0, n = 10000, tune.parameters = "all")
n100000 = cf_estimation(m1, m0, n = 100000, tune.parameters = "all")

# 2 x 2 grid of the fitted-vs-true CATE plots (patchwork operators)
(n100$g | n1000$g) /
  (n10000$g | n100000$g)

# RMSE by sample size, with a reference line at zero
ggplot(
  data.frame(
    RMSE = c(n100$RMSE, n1000$RMSE, n10000$RMSE, n100000$RMSE),
    n = factor(c("n=100", "n=1000", "n=10000", "n=100000"))
  ),
  aes(x = n, y = RMSE)
) +
  geom_point() +
  theme_bw() +
  geom_hline(yintercept = 0)