--- title: "Comparison of Methods" author: "Robin J. Evans" date: "25/05/2021" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Comparison of Methods} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set( collapse = TRUE, echo = TRUE, comment = "#>", cache = FALSE, fig.width=7, fig.height=7 ) # knitr::knit_theme$set("earendel") ``` As always, we begin by loading the package. ```{r load, include=TRUE, message=FALSE} library(causl) library(purrr) library(survey) ``` ## Simulate Data We first select the variables in our model, choosing the ones mentioned in the running example of Evans and Didelez (2021). In this case the model is given by the graph $A_0 \rightarrow L \rightarrow A_1 \rightarrow Y$ with $A_0 \rightarrow A_1$ and $L \leftrightarrow Y$. ```{r formulas} forms <- list(L ~ A0, list(A0 ~ 1, A1 ~ A0*L), Y ~ A0*A1, ~ A0) ``` We next select the parameters for our model, again following Evans and Didelez (2024). ```{r params} pars <- list(A0 = list(beta = 0), L = list(beta = c(0.3,-0.2), phi=1), A1 = list(beta = c(-0.3,0.4,0.3,0)), Y = list(beta = c(-0.5,0.2,0.3,0), phi=1), cop = list(beta = c(1,0.5))) ``` Now we sample $5,000$ observations from the model (you might like to edit this to increase the sample size to $10^6$, replicating the analysis in Evans and Didelez, 2024): ```{r simulate} set.seed(123) n <- 5e3 dat_max <- causalSamp(n, formulas = forms, pars=pars, family = list(3,c(5,5),1,1)) ``` ```{r echo=FALSE, eval=FALSE} library(tidyverse) dat_max <- dat_max %>% tibble %>% mutate(`(A,B)`=factor(A0+2*A1)) levels(dat_max$`(A,B)`) <- c("(0,0)", "(1,0)", "(0,1)", "(1,1)") ggplot(dat_max[1:1e3,], aes(x=log(L),y=Y, color=`(A,B)`)) + geom_point() + guides(fill=guide_legend(title="(A,B)")) + xlab("log(Z)") ``` Now we can check that the distribution actually has the correct form for the first three variables ($A_0, L, A_1$): ```{r glms1, echo=-1} options(digits=3) summary(glm(A0 ~ 1, family=binomial, data=dat_max))$coef summary(glm(L ~ A0, family=Gamma(link="log"), data=dat_max))$coef glmA1 <- glm(A1 ~ A0*L, family=binomial, data=dat_max) summary(glmA1)$coef ``` Indeed, all the parameters are close to their correct values. We can also use inverse probability weighting to check the causal relationship for $Y$. ```{r chk} ps <- fitted(glmA1) wt <- dat_max$A1/ps + (1-dat_max$A1)/(1-ps) summary(svyglm(Y ~ A0*A1, design = svydesign(~1, weights=~wt, data = dat_max)))$coef ``` For the remainder of this vignette, we will only use the first 1,000 entries of this dataset (change this to the commented out line to replicate results from our paper). ```{r dat, echo=TRUE} dat <- dat_max[seq_len(2e3), ] # dat <- dat_max[seq_len(1e4), ] ``` ### Outcome Regression We start with a naïve outcome regression approach, where we fit a linear model for $Y$ regressed on various combinations of $A_0,A_1$ and $L$. As we can see, none yield the parameters that interest us. ```{r outcome} lmY_A0A1 <- lm(Y ~ A0*A1, data=dat) lmY_A0A1_L <- lm(Y ~ A0*A1 + L, data=dat) lmY_A0A1L <- lm(Y ~ A0*A1*L, data=dat) summary(lmY_A0A1)$coef summary(lmY_A0A1_L)$coef summary(lmY_A0A1L)$coef ``` ```{r tab_or, echo=FALSE} tab_or <- summary(lmY_A0A1)$coef[,1:2] tab_or <- cbind(tab_or, tab_or[,1] - pars$Y$beta) colnames(tab_or) <- c("Est.", "SE", "Bias") ``` ### Inverse Propensity Weighting We can try the rather more principled approach of using inverse propensity score weighting, and this time the estimates are unbiased. ```{r ipw} ## get the weights from model for A1 glmA1 <- glm(A1 ~ A0*L, family=binomial, data=dat) ps <- fitted(glmA1) wt <- dat$A1/ps + (1-dat$A1)/(1-ps) lmY_A0A1_w <- svyglm(Y ~ A0*A1, design = svydesign(id=~1, data=dat, weights = ~wt)) summary(lmY_A0A1_w)$coef ``` Notice that the coefficients are now within their standard errors. ```{r tab_ipw, echo=FALSE} tab_ipw <- summary(lmY_A0A1_w)$coef[,1:2] tab_ipw <- cbind(tab_ipw, tab_ipw[,1] - pars$Y$beta) colnames(tab_ipw) <- c("Est.", "SE", "Bias") ``` ### Doubly Robust Approach We can also use an approach based on doubly-robust estimating equations. ```{r dr, eval=TRUE} ## get datasets with different values of A1 dat0 <- dat1 <- dat dat0$A1 <- 0 dat1$A1 <- 1 ## get outcome models glmY <- lm(Y ~ A0+A1+I(log(L)), data=dat) q <- predict(glmY, dat) q0 <- predict(glmY, dat0) q1 <- predict(glmY, dat1) n0 <- sum(dat$A0 == 0) n1 <- sum(dat$A0 == 1) ## weights w1 <- fitted(glmA1) w1[dat$A1==0] <- 1 - w1[dat$A1==0] w0 <- rep(1, nrow(dat)) # w0 <- predict(glmA0, dat, "response") # w0[dat$A0==0] <- 1 - w0[dat$A0==0] w <- w0 * w1 ## obtain E[Y | do(A0=a0,A1=a1)] for each (a0,a1) wts01 <- ((dat$Y - q)*dat$A1/w + q1)[dat$A0 == 0] wts00 <- ((dat$Y - q)*(1-dat$A1)/w + q0)[dat$A0 == 0] wts11 <- ((dat$Y - q)*dat$A1/w + q1)[dat$A0 == 1] wts10 <- ((dat$Y - q)*(1-dat$A1)/w + q0)[dat$A0 == 1] se00 <- sd(wts00)/sqrt(n0) se10 <- sd(wts10)/sqrt(n1) se01 <- sd(wts01)/sqrt(n0) se11 <- sd(wts11)/sqrt(n1) cse00_01 <- mean((wts00 - mean(wts00))*(wts01 - mean(wts01)))/n0 cse10_11 <- mean((wts10 - mean(wts10))*(wts11 - mean(wts11)))/n1 ## use these to obtain estimates, standard errors and bias est <- c(mean(wts00), mean(wts10) - mean(wts00), mean(wts01 - wts00), mean(wts11 - wts10) - mean(wts01 - wts00)) se <- c(se00, sqrt(se10^2 - 2*cse00_01 + se00^2), sqrt(se01^2 + se00^2), sqrt(se10^2 - 2*cse00_01 + se00^2) + sqrt(se10^2 - 2*cse10_11 + se11^2)) bias <- est - pars$Y$beta tab_dr <- cbind(est, se, bias) rownames(tab_dr) <- rownames(tab_ipw) colnames(tab_dr) <- c("Est.", "SE", "Bias") tab_dr ``` ### Maximum Likelihood Finally, we can fit using our own code with the black-box optimizer, and since we are fitting the correct model it is guaranteed to be consistent and asymptotically efficient. ```{r, echo=FALSE} knitr::opts_chunk$set( collapse = TRUE, # eval = FALSE, # echo = FALSE, comment = "#>", cache = FALSE, fig.width=7, fig.height=7 ) ``` ```{r mle, cache=TRUE} modY <- fitCausal(dat, formulas = list(Y ~ A0*A1, L ~ A0, ~ A0*A1), family = c(1,3,1), control=list(maxit=2e4, newton=TRUE)) modY ``` ```{r tab_mle, echo=FALSE, cache=TRUE} tab_mle <- cbind(modY$pars$Y$beta[1:4], modY$pars$Y$beta_sandwich[1:4], modY$pars$Y$beta[1:4]-pars$Y$beta) colnames(tab_mle) <- c("Est.", "SE", "Bias") ``` ### Comparison of Results Outcome regression fails miserably, but this is to be expected because the model is hopelessly misspecified. IP weighting, double robust estimates and the MLE all appear to be correct. ```{r results, echo=FALSE, results="asis"} results <- cbind(tab_or, tab_ipw, tab_dr, tab_mle) results[,1+rep(0:3, each=1)*3] <- round(results[,1+rep(0:3, each=1)*3], 2) results[,2+rep(0:3, each=1)*3] <- round(results[,2+rep(0:3, each=1)*3], 3) results[,3+(0:3)*3] <- round(results[,3+(0:3)*3], 3) results[,ncol(results)] <- round(results[,ncol(results)], 3) kableExtra::kbl(results, booktabs=TRUE, format="html") %>% ## change to format="latex" for paper kableExtra::add_header_above(c(" ","Outcome Regression"=3,"IP Weighting"=3,"Double Robust"=3,"MLE"=3)) ``` The code below (which we do not run because it takes over a minute) will allow you to compare $N=10^3$ results each with sample size $n=250$ via a naïve regression and inverse probability weighting. ```{r params2, echo=FALSE, eval=FALSE} pars <- list(A0 = list(beta = 0), L = list(beta = c(0,-1), phi=1), A1 = list(beta = c(0,0.5,0.5,0)), Y = list(beta = c(-1,0.5,0.5,0), phi=1), cop = list(beta = c(1,0.5))) ``` ```{r formulas2, echo=FALSE, eval=FALSE} forms <- list(L ~ A0, list(A0 ~ 1, A1 ~ A0*L), Y ~ A0*A1, ~ A0) ``` ```{r mult_sims, eval=FALSE} set.seed(234) n <- 250 N <- 1e3 out_ipw <- matrix(NA, N, 4) colnames(out_ipw) <- c("int", "A0", "A1", "A0_A1") out_or <- se_ipw <- se_or <- out_ipw out_mle <- se_mle <- out_ipw for (i in seq_len(N)) { dat <- causalSamp(n, formulas = forms, pars = pars, family = list(3,c(5,5),1,1)) ## get naive estimates lm_or <- summary(svyglm(Y ~ A0*A1, design=svydesign(~1, weights = rep(1,nrow(dat)), data=dat))) out_or[i,] <- lm_or$coef[,1] se_or[i,] <- lm_or$coef[,2] ## get weights for IPW glmA1 <- glm(A1 ~ A0*L, family=binomial, data=dat) tmp <- fitted(glmA1) wts <- dat$A1/tmp + (1-dat$A1)/(1-tmp) ## get IPW estimates lm_ipw <- summary(svyglm(Y ~ A0*A1, design=svydesign(~1, weights=wts, data=dat))) out_ipw[i,] <- lm_ipw$coef[,1] se_ipw[i,] <- lm_ipw$coef[,2] ## get MLEs tmp <- fitCausal(dat, formulas = forms[-2], family = list(3,1,1)) out_mle[i,] <- tmp$pars$Y$beta se_mle[i,] <- tmp$pars$Y$beta_sandwich printCount(i) } ``` ```{r boxplot, eval=FALSE, echo=FALSE} library(dplyr) library(tidyr) library(ggplot2) out_or <- as_tibble(out_or) se_or <- as_tibble(se_or) out_ipw <- as_tibble(out_ipw) se_ipw <- as_tibble(se_ipw) out_mle <- as_tibble(out_mle) se_mle <- as_tibble(se_mle) # out_ipw %>% colMeans() # out_or %>% colMeans() bias_ipw <- out_ipw - rep(pars$Y$beta, each=nrow(out_ipw)) bias_or <- out_or - rep(pars$Y$beta, each=nrow(out_or)) bias_mle <- out_mle - rep(pars$Y$beta, each=nrow(out_mle)) bias_or <- pivot_longer(bias_or, int:A0_A1, names_to="coef", values_to="bias") bias_or <- bias_or %>% mutate(coef=factor(coef, levels=c("int", "A0", "A1","A0_A1"))) bias_ipw <- pivot_longer(bias_ipw, int:A0_A1, names_to="coef", values_to="bias") bias_ipw <- bias_ipw %>% mutate(coef=factor(coef, levels=c("int", "A0", "A1","A0_A1"))) bias_mle <- pivot_longer(bias_mle, int:A0_A1, names_to="coef", values_to="bias") bias_mle <- bias_mle %>% mutate(coef=factor(coef, levels=c("int", "A0", "A1","A0_A1"))) ggplot(bias_ipw, aes(x=coef, y=bias, fill=coef)) + geom_boxplot() + theme_bw() + theme(legend.position = "none", axis.text = element_text(size=12), axis.title = element_text(size=12)) + xlab("coefficient") + ylim(c(-0.45,0.45)) + scale_x_discrete(labels=c("intercept","A","B","A*B")) ggplot(bias_or, aes(x=coef, y=bias, fill=coef)) + geom_boxplot() + theme_bw() + theme(legend.position = "none", axis.text = element_text(size=12), axis.title = element_text(size=12)) + xlab("coefficient") + ylim(c(-0.45,0.45)) + scale_x_discrete(labels=c("intercept","A","B","A*B")) ggplot(bias_mle, aes(x=coef, y=bias, fill=coef)) + geom_boxplot() + theme_bw() + theme(legend.position = "none", axis.text = element_text(size=12), axis.title = element_text(size=12)) + xlab("coefficient") + ylim(c(-0.45,0.45)) + scale_x_discrete(labels=c("intercept","A","B","A*B")) ```