As always, we begin by loading the package.
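A minimal set-up chunk along the following lines is assumed; the survey package is needed for the svyglm() and svydesign() calls used later in the vignette.

library(causl)   # provides causalSamp() and fitCausal()
library(survey)  # provides svydesign() and svyglm() for the weighted fits below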
We first select the variables in our model, choosing the ones mentioned in the running example of Evans and Didelez (2024). In this case the model is given by the graph A0 → L → A1 → Y, together with the edges A0 → A1, A0 → Y and L ↔ Y.
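The list of formulas `forms` passed to causalSamp() below is not reproduced in this extract. A sketch that is consistent with the families and parameter counts used later (the exact right-hand sides, and in particular the copula formula, are assumptions) is:

forms <- list(L ~ A0,                    # background/covariate model
              list(A0 ~ 1, A1 ~ A0*L),   # treatment models
              Y ~ A0*A1,                 # outcome model
              ~ A0)                      # copula formula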
We next select the parameters for our model, again following Evans and Didelez (2024).
pars <- list(A0 = list(beta = 0),
             L = list(beta = c(0.3, -0.2), phi = 1),
             A1 = list(beta = c(-0.3, 0.4, 0.3, 0)),
             Y = list(beta = c(-0.5, 0.2, 0.3, 0), phi = 1),
             cop = list(beta = c(1, 0.5)))
Now we sample 5,000 observations from the model (you might like to edit this to increase the sample size to 10^6, replicating the analysis in Evans and Didelez, 2024):
set.seed(123)
n <- 5e3
dat_max <- causalSamp(n, formulas = forms, pars=pars, family = list(3,c(5,5),1,1))
Now we can check that the distribution actually has the correct form for the first three variables (A0, L, A1):
summary(glm(A0 ~ 1, family=binomial, data=dat_max))$coef
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -0.0216 0.0283 -0.764 0.445
summary(glm(L ~ A0, family=Gamma(link="log"), data=dat_max))$coef
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.273 0.0197 13.84 9.42e-43
#> A0 -0.183 0.0281 -6.54 6.87e-11
glmA1 <- glm(A1 ~ A0*L, family=binomial, data=dat_max)
summary(glmA1)$coef
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -0.322 0.0586 -5.506 3.66e-08
#> A0 0.398 0.0837 4.754 2.00e-06
#> L 0.311 0.0347 8.945 3.72e-19
#> A0:L -0.026 0.0552 -0.472 6.37e-01
Indeed, all the estimates are close to the true parameter values.
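As a quick check (not part of the original output), the fitted coefficients for A1 can be placed next to the values used to generate the data:

cbind(estimate = coef(glmA1), truth = pars$A1$beta)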
We can also use inverse probability weighting to check the causal relationship for Y.
ps <- fitted(glmA1)
wt <- dat_max$A1/ps + (1-dat_max$A1)/(1-ps)
summary(svyglm(Y ~ A0*A1, design = svydesign(~1, weights=~wt, data = dat_max)))$coef
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.4912 0.0296 -16.62 2.09e-60
#> A0 0.1938 0.0450 4.31 1.68e-05
#> A1 0.2543 0.0407 6.24 4.64e-10
#> A0:A1 0.0737 0.0593 1.24 2.14e-01
For the remainder of this vignette we will use only the first 1,000 entries of this dataset (switch to the commented-out alternative to replicate the results from our paper).
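A sketch of this subsetting step (the exact chunk is not reproduced in this extract) is:

dat <- dat_max[seq_len(1e3), ]   # first 1,000 rows only
# dat <- dat_max                 # uncomment to use the full sample, as in the paper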
We start with a naïve outcome regression approach, fitting linear models for Y on various combinations of A0, A1 and L. As the output below shows, none of these recovers the causal parameters that interest us.
lmY_A0A1 <- lm(Y ~ A0*A1, data=dat)
lmY_A0A1_L <- lm(Y ~ A0*A1 + L, data=dat)
lmY_A0A1L <- lm(Y ~ A0*A1*L, data=dat)
summary(lmY_A0A1)$coef
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.62211 0.0460 -13.5190 6.52e-40
#> A0 0.22298 0.0674 3.3105 9.48e-04
#> A1 0.44969 0.0632 7.1164 1.54e-12
#> A0:A1 0.00593 0.0892 0.0665 9.47e-01
summary(lmY_A0A1_L)$coef
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -1.0403 0.0436 -23.841 9.56e-111
#> A0 0.2604 0.0589 4.418 1.05e-05
#> A1 0.2849 0.0557 5.118 3.38e-07
#> L 0.3978 0.0160 24.794 1.59e-118
#> A0:A1 0.0588 0.0780 0.754 4.51e-01
summary(lmY_A0A1L)$coef
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -1.0370 0.0563 -18.435 3.39e-70
#> A0 0.0776 0.0820 0.946 3.44e-01
#> A1 0.4825 0.0775 6.230 5.69e-10
#> L 0.3946 0.0379 10.417 8.88e-25
#> A0:A1 -0.0655 0.1092 -0.600 5.49e-01
#> A0:L 0.1906 0.0579 3.291 1.02e-03
#> A1:L -0.1340 0.0459 -2.920 3.54e-03
#> A0:A1:L 0.0322 0.0693 0.464 6.43e-01
We can try the rather more principled approach of inverse propensity score weighting; since the propensity model is correctly specified, this time the estimates are close to the true values.
## get the weights from model for A1
glmA1 <- glm(A1 ~ A0*L, family=binomial, data=dat)
ps <- fitted(glmA1)
wt <- dat$A1/ps + (1-dat$A1)/(1-ps)
lmY_A0A1_w <- svyglm(Y ~ A0*A1, design = svydesign(id=~1, data=dat, weights = ~wt))
summary(lmY_A0A1_w)$coef
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.5454 0.0491 -11.102 7.90e-28
#> A0 0.2452 0.0738 3.323 9.08e-04
#> A1 0.3155 0.0649 4.859 1.27e-06
#> A0:A1 -0.0161 0.0940 -0.171 8.64e-01
Notice that the coefficients are now all within about one standard error of their true values (-0.5, 0.2, 0.3 and 0).
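The doubly robust calculation below reuses the row names of a table `tab_ipw` of IPW results; a sketch of how such a table could be assembled (its exact construction is not shown in this extract) is:

tab_ipw <- cbind(coef(lmY_A0A1_w),
                 summary(lmY_A0A1_w)$coef[, 2],
                 coef(lmY_A0A1_w) - pars$Y$beta)
colnames(tab_ipw) <- c("Est.", "SE", "Bias")
tab_ipw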
We can also use an approach based on doubly-robust estimating equations.
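Schematically, for each treatment combination $(a_0, a_1)$ the chunk below averages terms of the standard augmented IPW form

$$
\hat\mu_{a_0 a_1} \;=\; \frac{1}{n_{a_0}} \sum_{i \,:\, A_{0i} = a_0} \left\{ \frac{\mathbb{1}\{A_{1i} = a_1\}}{\hat\pi(a_1 \mid A_{0i}, L_i)} \bigl(Y_i - \hat q(A_{0i}, A_{1i}, L_i)\bigr) + \hat q(A_{0i}, a_1, L_i) \right\},
$$

where $\hat q$ is a fitted outcome regression and $\hat\pi$ the fitted propensity score for $A_1$. Such an estimator is consistent provided at least one of these two models is correctly specified.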
## get datasets with different values of A1
dat0 <- dat1 <- dat
dat0$A1 <- 0
dat1$A1 <- 1
## get outcome models
glmY <- lm(Y ~ A0+A1+I(log(L)), data=dat)
q <- predict(glmY, dat)
q0 <- predict(glmY, dat0)
q1 <- predict(glmY, dat1)
n0 <- sum(dat$A0 == 0)
n1 <- sum(dat$A0 == 1)
## weights
w1 <- fitted(glmA1)
w1[dat$A1==0] <- 1 - w1[dat$A1==0]
w0 <- rep(1, nrow(dat))
# w0 <- predict(glmA0, dat, "response")
# w0[dat$A0==0] <- 1 - w0[dat$A0==0]
w <- w0 * w1
## obtain E[Y | do(A0=a0,A1=a1)] for each (a0,a1)
wts01 <- ((dat$Y - q)*dat$A1/w + q1)[dat$A0 == 0]
wts00 <- ((dat$Y - q)*(1-dat$A1)/w + q0)[dat$A0 == 0]
wts11 <- ((dat$Y - q)*dat$A1/w + q1)[dat$A0 == 1]
wts10 <- ((dat$Y - q)*(1-dat$A1)/w + q0)[dat$A0 == 1]
se00 <- sd(wts00)/sqrt(n0)
se10 <- sd(wts10)/sqrt(n1)
se01 <- sd(wts01)/sqrt(n0)
se11 <- sd(wts11)/sqrt(n1)
cse00_01 <- mean((wts00 - mean(wts00))*(wts01 - mean(wts01)))/n0
cse10_11 <- mean((wts10 - mean(wts10))*(wts11 - mean(wts11)))/n1
## use these to obtain estimates, standard errors and bias
est <- c(mean(wts00), mean(wts10) - mean(wts00),
         mean(wts01 - wts00), mean(wts11 - wts10) - mean(wts01 - wts00))
se <- c(se00,
        sqrt(se10^2 - 2*cse00_01 + se00^2),
        sqrt(se01^2 + se00^2),
        sqrt(se10^2 - 2*cse00_01 + se00^2) + sqrt(se10^2 - 2*cse10_11 + se11^2))
bias <- est - pars$Y$beta
tab_dr <- cbind(est, se, bias)
rownames(tab_dr) <- rownames(tab_ipw)
colnames(tab_dr) <- c("Est.", "SE", "Bias")
tab_dr
#> Est. SE Bias
#> (Intercept) -0.5510 0.0450 -0.0510
#> A0 0.2464 0.0620 0.0464
#> A1 0.3283 0.0606 0.0283
#> A0:A1 -0.0218 0.1148 -0.0218
Finally, we can fit the model by maximum likelihood using the package's black-box optimizer; since the model we fit is correctly specified, the resulting estimator is consistent and asymptotically efficient.
modY <- fitCausal(dat, formulas = list(Y ~ A0*A1, L ~ A0, ~ A0*A1),
                  family = c(1,3,1), control = list(maxit = 2e4, newton = TRUE))
#> Warning: `fitCausal()` was deprecated in causl 0.8.8.
#> ℹ Please use `fit_causl()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> Newton's algorithm failed, exiting
modY
#> log-likelihood: -4766
#> Y ~ A0 * A1
#> est. s.e. sandwich
#> (intercept) -0.54986 0.0427 0.0429
#> A0 0.23261 0.0599 0.0606
#> A1 0.31747 0.0559 0.0555
#> A0:A1 -0.00193 0.0744 0.0743
#> residual s.e.: 0.97 0.0307 0.0308
#>
#> L ~ A0
#> est. s.e. sandwich
#> (intercept) 0.242 0.0328 0.0326
#> A0 -0.124 0.0454 0.0450
#> residual s.e.: 1.05 0.029 0.0302
#>
#> copula parameters:
#> cop ~ A0 + A1 + A0:A1
#> cop_Y_L:
#> est. s.e. sandwich
#> (intercept) 1.105 0.0895 0.0851
#> A0 0.321 0.1226 0.1187
#> A1 -0.125 0.1180 0.1172
#> A0:A1 0.259 0.1604 0.1582
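The fitted components can also be extracted programmatically, as the simulation loop at the end of this vignette does; for instance:

modY$pars$Y$beta            # point estimates for the Y regression
modY$pars$Y$beta_sandwich   # corresponding sandwich standard errors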
Outcome regression fails badly, but this is to be expected because the outcome model is hopelessly misspecified. IP weighting, the doubly robust estimator and the MLE all appear approximately correct; the table below collects the four sets of estimates.
| | OR Est. | OR SE | OR Bias | IPW Est. | IPW SE | IPW Bias | DR Est. | DR SE | DR Bias | MLE Est. | MLE SE | MLE Bias |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | -0.62 | 0.046 | -0.122 | -0.55 | 0.049 | -0.045 | -0.55 | 0.045 | -0.051 | -0.55 | 0.043 | -0.050 |
| A0 | 0.22 | 0.067 | 0.023 | 0.25 | 0.074 | 0.045 | 0.25 | 0.062 | 0.046 | 0.23 | 0.061 | 0.033 |
| A1 | 0.45 | 0.063 | 0.150 | 0.32 | 0.065 | 0.015 | 0.33 | 0.061 | 0.028 | 0.32 | 0.055 | 0.017 |
| A0:A1 | 0.01 | 0.089 | 0.006 | -0.02 | 0.094 | -0.016 | -0.02 | 0.115 | -0.022 | 0.00 | 0.074 | -0.002 |
The code below (which we do not run here because it takes over a minute) allows you to compare results from N = 10^3 simulated datasets, each with sample size n = 250, using a naïve regression, inverse probability weighting and the maximum likelihood estimator.
set.seed(234)
n <- 250
N <- 1e3
out_ipw <- matrix(NA, N, 4)
colnames(out_ipw) <- c("int", "A0", "A1", "A0_A1")
out_or <- se_ipw <- se_or <- out_ipw
out_mle <- se_mle <- out_ipw
for (i in seq_len(N)) {
  dat <- causalSamp(n, formulas = forms, pars = pars, family = list(3,c(5,5),1,1))
  ## get naive estimates
  lm_or <- summary(svyglm(Y ~ A0*A1, design=svydesign(~1, weights = rep(1,nrow(dat)), data=dat)))
  out_or[i,] <- lm_or$coef[,1]
  se_or[i,] <- lm_or$coef[,2]
  ## get weights for IPW
  glmA1 <- glm(A1 ~ A0*L, family=binomial, data=dat)
  tmp <- fitted(glmA1)
  wts <- dat$A1/tmp + (1-dat$A1)/(1-tmp)
  ## get IPW estimates
  lm_ipw <- summary(svyglm(Y ~ A0*A1, design=svydesign(~1, weights=wts, data=dat)))
  out_ipw[i,] <- lm_ipw$coef[,1]
  se_ipw[i,] <- lm_ipw$coef[,2]
  ## get MLEs
  tmp <- fitCausal(dat, formulas = forms[-2], family = list(3,1,1))
  out_mle[i,] <- tmp$pars$Y$beta
  se_mle[i,] <- tmp$pars$Y$beta_sandwich
  printCount(i)
}
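If you do run the loop, one simple way to compare the three estimators is sketched below; `true_beta` is introduced here purely for illustration.

## average bias of each estimator over the N replications
true_beta <- pars$Y$beta
rbind(naive = colMeans(out_or) - true_beta,
      ipw = colMeans(out_ipw) - true_beta,
      mle = colMeans(out_mle) - true_beta)
## empirical standard deviation of the IPW estimates, for comparison with se_ipw
apply(out_ipw, 2, sd)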