---
title: "Comparison of Methods"
author: "Robin J. Evans"
date: "25/05/2021"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Comparison of Methods}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  echo = TRUE,
  comment = "#>",
  cache = FALSE,
  fig.width=7, 
  fig.height=7
)
# knitr::knit_theme$set("earendel")
```

As always, we begin by loading the package.
```{r load, include=TRUE, message=FALSE}
library(causl)
library(purrr)
library(survey)
```

## Simulate Data

We first select the variables in our model, choosing the ones mentioned 
in the running example of Evans and Didelez (2021).  In this case the model is
given by the graph $A_0 \rightarrow L \rightarrow A_1 \rightarrow Y$ with
$A_0 \rightarrow A_1$ and $L \leftrightarrow Y$.
```{r formulas}
forms <- list(L ~ A0, 
              list(A0 ~ 1, A1 ~ A0*L),
              Y ~ A0*A1, 
              ~ A0)
```

We next select the parameters for our model, again following 
Evans and Didelez (2024).

```{r params}
pars <- list(A0 = list(beta = 0),
             L = list(beta = c(0.3,-0.2), phi=1),
             A1 = list(beta = c(-0.3,0.4,0.3,0)),
             Y = list(beta = c(-0.5,0.2,0.3,0), phi=1),
             cop = list(beta = c(1,0.5)))
```

Now we sample $5,000$ observations from the model (you might like to edit this
to increase the sample size to $10^6$, replicating the analysis in Evans and 
Didelez, 2024):
```{r simulate}
set.seed(123)
n <- 5e3
dat_max <- causalSamp(n, formulas = forms, pars=pars, family = list(3,c(5,5),1,1))
```

```{r echo=FALSE, eval=FALSE}
library(tidyverse)
dat_max <- dat_max %>% tibble %>% 
  mutate(`(A,B)`=factor(A0+2*A1))
levels(dat_max$`(A,B)`) <- c("(0,0)", "(1,0)", "(0,1)", "(1,1)")
ggplot(dat_max[1:1e3,], aes(x=log(L),y=Y, color=`(A,B)`)) + 
  geom_point() + 
  guides(fill=guide_legend(title="(A,B)")) + 
  xlab("log(Z)")
```

<!-- To get the dataset of size $10^4$ used in Appendix D, run the following code. -->
<!-- ```{r simulate, eval=FALSE} -->
<!-- set.seed(123) -->
<!-- n <- 1e4 -->
<!-- dat_max <- rfrugalParam2(n, formulas = forms, pars=pars, family = list(3,c(5,5),1,1)) -->
<!-- ``` -->
Now we can check that the distribution actually has the correct form for the
first three variables ($A_0, L, A_1$):
```{r glms1, echo=-1}
options(digits=3)
summary(glm(A0 ~ 1, family=binomial, data=dat_max))$coef
summary(glm(L ~ A0, family=Gamma(link="log"), data=dat_max))$coef
glmA1 <- glm(A1 ~ A0*L, family=binomial, data=dat_max)
summary(glmA1)$coef
```
Indeed, all the parameters are close to their correct values.

We can also use inverse probability weighting to check the causal 
relationship for $Y$.
```{r chk}
ps <- fitted(glmA1)
wt <- dat_max$A1/ps + (1-dat_max$A1)/(1-ps)
summary(svyglm(Y ~ A0*A1, design = svydesign(~1, weights=~wt, data = dat_max)))$coef
```

<!-- summary(glm(L ~ A0, family=Gamma(link="log"), data=dat))$coef -->
<!-- glmA1 <- glm(A1 ~ A0*L, family=binomial, data=dat) -->
<!-- summary(glmA1)$coef -->
<!-- wts <- predict(glmA1, type="response") -->
<!-- wts[dat$A1 == 0] <- 1 - wts[dat$A1 == 0] -->
<!-- summary(glm(Y ~ A0*A1, weight=1/wts, data=dat)) -->

For the remainder of this vignette, we will only use the first 1,000 entries 
of this dataset (change this to the commented out line to replicate results 
from our paper).  
```{r dat, echo=TRUE}
dat <- dat_max[seq_len(2e3), ]
# dat <- dat_max[seq_len(1e4), ]
```

### Outcome Regression

We start with a naïve outcome regression approach, where we fit a linear 
model for $Y$ regressed on various combinations of $A_0,A_1$ and $L$.  As we can
see, none yield the parameters that interest us.

```{r outcome}
lmY_A0A1 <- lm(Y ~ A0*A1, data=dat)
lmY_A0A1_L <- lm(Y ~ A0*A1 + L, data=dat)
lmY_A0A1L <- lm(Y ~ A0*A1*L, data=dat)
summary(lmY_A0A1)$coef
summary(lmY_A0A1_L)$coef
summary(lmY_A0A1L)$coef
```

```{r tab_or, echo=FALSE}
tab_or <- summary(lmY_A0A1)$coef[,1:2]
tab_or <- cbind(tab_or, tab_or[,1] - pars$Y$beta)
colnames(tab_or) <- c("Est.", "SE", "Bias")
```

### Inverse Propensity Weighting

We can try the rather more principled approach of using 
inverse propensity score weighting, and this time the estimates are unbiased. 

```{r ipw}
## get the weights from model for A1
glmA1 <- glm(A1 ~ A0*L, family=binomial, data=dat)
ps <- fitted(glmA1)
wt <- dat$A1/ps + (1-dat$A1)/(1-ps) 

lmY_A0A1_w <- svyglm(Y ~ A0*A1, design = svydesign(id=~1, data=dat, weights = ~wt))
summary(lmY_A0A1_w)$coef
```
Notice that the coefficients are now within their standard errors.
```{r tab_ipw, echo=FALSE}
tab_ipw <- summary(lmY_A0A1_w)$coef[,1:2]
tab_ipw <- cbind(tab_ipw, tab_ipw[,1] - pars$Y$beta)
colnames(tab_ipw) <- c("Est.", "SE", "Bias")
```

### Doubly Robust Approach

We can also use an approach based on doubly-robust estimating equations.

```{r dr, eval=TRUE}
## get datasets with different values of A1
dat0 <- dat1 <- dat
dat0$A1 <- 0
dat1$A1 <- 1

## get outcome models
glmY <- lm(Y ~ A0+A1+I(log(L)), data=dat)
q <- predict(glmY, dat)
q0 <- predict(glmY, dat0)
q1 <- predict(glmY, dat1)

n0 <- sum(dat$A0 == 0)
n1 <- sum(dat$A0 == 1)

## weights
w1 <- fitted(glmA1)
w1[dat$A1==0] <- 1 - w1[dat$A1==0]
w0 <- rep(1, nrow(dat))
# w0 <- predict(glmA0, dat, "response")
# w0[dat$A0==0] <- 1 - w0[dat$A0==0]
w <- w0 * w1


## obtain E[Y | do(A0=a0,A1=a1)] for each (a0,a1)
wts01 <- ((dat$Y - q)*dat$A1/w + q1)[dat$A0 == 0]
wts00 <- ((dat$Y - q)*(1-dat$A1)/w + q0)[dat$A0 == 0]
wts11 <- ((dat$Y - q)*dat$A1/w + q1)[dat$A0 == 1]
wts10 <- ((dat$Y - q)*(1-dat$A1)/w + q0)[dat$A0 == 1]
se00 <- sd(wts00)/sqrt(n0)
se10 <- sd(wts10)/sqrt(n1)
se01 <- sd(wts01)/sqrt(n0)
se11 <- sd(wts11)/sqrt(n1)

cse00_01 <- mean((wts00 - mean(wts00))*(wts01 - mean(wts01)))/n0
cse10_11 <- mean((wts10 - mean(wts10))*(wts11 - mean(wts11)))/n1

## use these to obtain estimates, standard errors and bias
est <- c(mean(wts00), mean(wts10) - mean(wts00), 
         mean(wts01 - wts00), mean(wts11 - wts10) - mean(wts01 - wts00))
se <- c(se00, 
        sqrt(se10^2 - 2*cse00_01 + se00^2), 
        sqrt(se01^2 + se00^2),
        sqrt(se10^2 - 2*cse00_01 + se00^2) + sqrt(se10^2 - 2*cse10_11 + se11^2))

bias <- est - pars$Y$beta
tab_dr <- cbind(est, se, bias)
rownames(tab_dr) <- rownames(tab_ipw)
colnames(tab_dr) <- c("Est.", "SE", "Bias")
tab_dr
```

### Maximum Likelihood

Finally, we can fit using our own code with the black-box 
optimizer, and since we are fitting the correct model it is
guaranteed to be consistent and asymptotically efficient.

```{r, echo=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  # eval = FALSE,
  # echo = FALSE,
  comment = "#>",
  cache = FALSE,
  fig.width=7, 
  fig.height=7
)
```

```{r mle, cache=TRUE}
modY <- fitCausal(dat, formulas = list(Y ~ A0*A1, L ~ A0, ~ A0*A1),
                  family = c(1,3,1), control=list(maxit=2e4, newton=TRUE))
modY
```

```{r tab_mle, echo=FALSE, cache=TRUE}
tab_mle <- cbind(modY$pars$Y$beta[1:4], modY$pars$Y$beta_sandwich[1:4],
                 modY$pars$Y$beta[1:4]-pars$Y$beta)
colnames(tab_mle) <- c("Est.", "SE", "Bias")
```


### Comparison of Results

Outcome regression fails miserably, but this is to be expected because the model
is hopelessly misspecified.  IP weighting, double robust estimates and the MLE
all appear to be correct.  

```{r results, echo=FALSE, results="asis"}
results <- cbind(tab_or, tab_ipw, tab_dr, tab_mle)
results[,1+rep(0:3, each=1)*3] <- round(results[,1+rep(0:3, each=1)*3], 2)
results[,2+rep(0:3, each=1)*3] <- round(results[,2+rep(0:3, each=1)*3], 3)
results[,3+(0:3)*3] <- round(results[,3+(0:3)*3], 3)
results[,ncol(results)] <- round(results[,ncol(results)], 3)
kableExtra::kbl(results, booktabs=TRUE, format="html") %>%  ## change to format="latex" for paper
  kableExtra::add_header_above(c(" ","Outcome Regression"=3,"IP Weighting"=3,"Double Robust"=3,"MLE"=3))
```


The code below (which we do not run because it takes over a minute) will allow 
you to compare $N=10^3$ results each with sample size $n=250$ via a naïve 
regression and inverse probability weighting.

```{r params2, echo=FALSE, eval=FALSE}
pars <- list(A0 = list(beta = 0),
             L = list(beta = c(0,-1), phi=1),
             A1 = list(beta = c(0,0.5,0.5,0)), 
             Y = list(beta = c(-1,0.5,0.5,0), phi=1),
             cop = list(beta = c(1,0.5)))
```
```{r formulas2, echo=FALSE, eval=FALSE}
forms <- list(L ~ A0,
                 list(A0 ~ 1, A1 ~ A0*L),
                 Y ~ A0*A1,
                  ~ A0)
```

```{r mult_sims, eval=FALSE}
set.seed(234)
n <- 250
N <- 1e3
out_ipw <- matrix(NA, N, 4)
colnames(out_ipw) <- c("int", "A0", "A1", "A0_A1")
out_or <- se_ipw <- se_or <- out_ipw
out_mle <- se_mle <- out_ipw

for (i in seq_len(N)) {
  dat <- causalSamp(n, formulas = forms, pars = pars, family = list(3,c(5,5),1,1))
  
  ## get naive estimates
  lm_or <- summary(svyglm(Y ~ A0*A1, design=svydesign(~1, weights = rep(1,nrow(dat)), data=dat)))
  
  out_or[i,] <- lm_or$coef[,1]
  se_or[i,] <- lm_or$coef[,2]
  
  ## get weights for IPW
  glmA1 <- glm(A1 ~ A0*L, family=binomial, data=dat)
  tmp <- fitted(glmA1)
  wts <- dat$A1/tmp + (1-dat$A1)/(1-tmp)
  
  ## get IPW estimates
  lm_ipw <- summary(svyglm(Y ~ A0*A1, design=svydesign(~1, weights=wts, data=dat)))
  
  out_ipw[i,] <- lm_ipw$coef[,1]
  se_ipw[i,] <- lm_ipw$coef[,2]
  
  ## get MLEs
  tmp <- fitCausal(dat, formulas = forms[-2], family = list(3,1,1))
  out_mle[i,] <- tmp$pars$Y$beta
  se_mle[i,] <- tmp$pars$Y$beta_sandwich
  
  printCount(i)
}
```


<!-- The code below plots the empirical biases as boxplots. -->
```{r boxplot, eval=FALSE, echo=FALSE}
library(dplyr)
library(tidyr)
library(ggplot2)
out_or <- as_tibble(out_or) 
se_or <- as_tibble(se_or) 
out_ipw <- as_tibble(out_ipw) 
se_ipw <- as_tibble(se_ipw) 
out_mle <- as_tibble(out_mle) 
se_mle <- as_tibble(se_mle) 

# out_ipw %>% colMeans()
# out_or %>% colMeans()
bias_ipw <- out_ipw - rep(pars$Y$beta, each=nrow(out_ipw))
bias_or <- out_or - rep(pars$Y$beta, each=nrow(out_or))
bias_mle <- out_mle - rep(pars$Y$beta, each=nrow(out_mle))

bias_or <- pivot_longer(bias_or, int:A0_A1, names_to="coef", values_to="bias")
bias_or <- bias_or %>% mutate(coef=factor(coef, levels=c("int", "A0", "A1","A0_A1")))
bias_ipw <- pivot_longer(bias_ipw, int:A0_A1, names_to="coef", values_to="bias")
bias_ipw <- bias_ipw %>% mutate(coef=factor(coef, levels=c("int", "A0", "A1","A0_A1")))
bias_mle <- pivot_longer(bias_mle, int:A0_A1, names_to="coef", values_to="bias")
bias_mle <- bias_mle %>% mutate(coef=factor(coef, levels=c("int", "A0", "A1","A0_A1")))

ggplot(bias_ipw, aes(x=coef, y=bias, fill=coef)) +
  geom_boxplot() + theme_bw() + 
  theme(legend.position = "none",
        axis.text = element_text(size=12), 
        axis.title = element_text(size=12)) + 
  xlab("coefficient") + ylim(c(-0.45,0.45)) + 
  scale_x_discrete(labels=c("intercept","A","B","A*B"))

ggplot(bias_or, aes(x=coef, y=bias, fill=coef)) +
  geom_boxplot() + theme_bw() + 
  theme(legend.position = "none",
        axis.text = element_text(size=12), 
        axis.title = element_text(size=12)) + 
  xlab("coefficient") + ylim(c(-0.45,0.45)) + 
  scale_x_discrete(labels=c("intercept","A","B","A*B"))

ggplot(bias_mle, aes(x=coef, y=bias, fill=coef)) +
  geom_boxplot() + theme_bw() + 
  theme(legend.position = "none",
        axis.text = element_text(size=12), 
        axis.title = element_text(size=12)) + 
  xlab("coefficient") + ylim(c(-0.45,0.45)) + 
  scale_x_discrete(labels=c("intercept","A","B","A*B"))

```