Methods for use with discovery GWAS

library(winnerscurse)

This vignette demonstrates how our R package, winnerscurse, can be used to obtain new adjusted association estimates for each SNP when only the summary statistics from a single genetic association study, namely the discovery GWAS, are available. In order to do so, we create a toy data set using the function sim_stats, which is included in the package, and subsequently, illustrate how a user could engage with each of the package’s Winner’s Curse correction functions using this data set.

The methods currently available for implementation are:

  1. Conditional Likelihood methods - Ghosh et al. (2008)
  2. Empirical Bayes method - Ferguson et al. (2013)
  3. FDR Inverse Quantile Transformation method - Bigdeli et al. (2016)
  4. Bootstrap method - Forde et al. (2023)

An important feature which distinguishes these four methods is detailed in the table below:

Adjusts β estimate of only SNPs with p-values less than specified threshold, α:
  • Conditional Likelihood methods

Adjusts β estimate of all SNPs:
  • Empirical Bayes method
  • FDR Inverse Quantile Transformation method
  • Bootstrap method

Creating a toy data set

## Fix the RNG seed so the simulated summary statistics are reproducible.
set.seed(1998)

## Simulate a discovery GWAS: 10^6 SNPs, heritability h2 = 0.4, 1% of SNPs
## carrying a true non-zero effect, and a sample size of 50,000 individuals.
sim_dataset <- sim_stats(nsnp=10^6,h2=0.4,prop_effect=0.01,nid=50000)

## simulated GWAS summary statistics (discovery component of the simulation)
summary_stats <- sim_dataset$disc
head(summary_stats)
#>   rsid         beta          se
#> 1    1 -0.011423267 0.019222517
#> 2    2  0.002883483 0.009187098
#> 3    3 -0.043813790 0.016459133
#> 4    4 -0.018164924 0.019560463
#> 5    5  0.021349091 0.006335237
#> 6    6 -0.001559945 0.007697227

Method 1: Conditional Likelihood

## Conditional likelihood (Ghosh et al. 2008): adjusts only SNPs whose
## p-value falls below the chosen significance threshold, alpha. The output
## carries three alternative corrected estimates per significant SNP
## (beta.cl1, beta.cl2, beta.cl3) alongside the naive beta and se.
out_CL <- conditional_likelihood(summary_data = summary_stats, alpha = 5e-8) 
head(out_CL)
#>   rsid        beta          se    beta.cl1    beta.cl2    beta.cl3
#> 1 7815  0.04771727 0.006568299  0.04709173  0.04575084  0.04642129
#> 2 3965  0.04609999 0.006346232  0.04549476  0.04419814  0.04484645
#> 3 4998 -0.04527792 0.006458823 -0.04421716 -0.04238468 -0.04330092
#> 4 9917  0.04164616 0.006334188  0.03909824  0.03598923  0.03754373
#> 5 7261  0.04162686 0.006343244  0.03900942  0.03584977  0.03742959
#> 6 6510  0.04254046 0.006500057  0.03975867  0.03645266  0.03810567

Method 2: Empirical Bayes

## Empirical Bayes (Ferguson et al. 2013): takes no threshold argument and
## returns a shrunken estimate, beta_EB, for every SNP in the data set.
out_EB <- empirical_bayes(summary_data = summary_stats)
head(out_EB)
#>   rsid        beta          se     beta_EB
#> 1 7815  0.04771727 0.006568299  0.04599530
#> 2 3965  0.04609999 0.006346232  0.04443623
#> 3 4998 -0.04527792 0.006458823 -0.03999539
#> 4 9917  0.04164616 0.006334188  0.03998556
#> 5 7261  0.04162686 0.006343244  0.03996389
#> 6 6510  0.04254046 0.006500057  0.04083638

Method 3: FDR Inverse Quantile Transformation

## FDR Inverse Quantile Transformation (Bigdeli et al. 2016): adjusts all
## SNPs, returning the corrected estimate as beta_FIQT.
out_FIQT <- FDR_IQT(summary_data = summary_stats)
head(out_FIQT)
#>   rsid        beta          se   beta_FIQT
#> 1 7815  0.04771727 0.006568299  0.03422825
#> 2 3965  0.04609999 0.006346232  0.03307103
#> 3 4998 -0.04527792 0.006458823 -0.03188772
#> 4 9917  0.04164616 0.006334188  0.02823824
#> 5 7261  0.04162686 0.006343244  0.02827861
#> 6 6510  0.04254046 0.006500057  0.02897770

Method 4: Bootstrap

 

Brief method description:

## Bootstrap method (Forde et al. 2023): adjusts all SNPs, returning
## beta_BR_ss. seed_opt=TRUE together with a fixed seed makes the
## resampling step reproducible across runs.
out_BR_ss <- BR_ss(summary_data = summary_stats, seed_opt=TRUE, seed=2020)
head(out_BR_ss)
#>   rsid        beta          se  beta_BR_ss
#> 1 7815  0.04771727 0.006568299  0.03628244
#> 2 3965  0.04609999 0.006346232  0.03505042
#> 3 4998 -0.04527792 0.006458823 -0.02990042
#> 4 9917  0.04164616 0.006334188  0.02915571
#> 5 7261  0.04162686 0.006343244  0.02909271
#> 6 6510  0.04254046 0.006500057  0.02965867

Comparing Results

As we have simulated our data set, we have the ability to quantify the amount of bias present in the original naive β estimates and also, in our adjusted estimates for each method. Thus, we can briefly examine how each correction method performed with respect to our simulated data set. Similar to Forde et al. (2023), assessment can take place using measures such as the estimated root mean square error of significant SNPs (rmse) and the estimated average bias over all significant SNPs (bias). These two evaluation metrics can be mathematically defined as follows, in which there are Nsig significant SNPs:

Note: This average bias will be computed separately for significant SNPs with positive and negative true effects to avoid the bias in either direction simply cancelling each other out.

Note: We compare our methods based on their performance with respect to SNPs that pass a certain threshold as these are the SNPs that have association estimates affected by Winner’s Curse and most often, it is the effect sizes of these SNPs that researchers are most interested in.

We consider a significance threshold of α = 5 × 10−8 and subset the outputted data frames as required.

## Simulated true effect sizes, indexed by rsid:
true_beta <- sim_dataset$true$true_beta 

## Restrict each method's output to genome-wide significant SNPs (p < 5e-8).
## The two-sided p-value is recomputed from the naive z-score beta/se.
keep_sig <- function(d) {
  d[2 * pnorm(abs(d$beta / d$se), lower.tail = FALSE) < 5e-8, ]
}
out_EB_sig <- keep_sig(out_EB)
out_FIQT_sig <- keep_sig(out_FIQT)
out_BR_ss_sig <- keep_sig(out_BR_ss)

The estimated root mean square error of significant SNPs for each method is computed below. It can be seen that all methods demonstrate improvements over merely using the naive unadjusted association estimates as the rmse values for each method are lower than that of naive. The bootstrap method provides the lowest value for rmse.

## rmse: estimated root mean square error over significant SNPs, per method.
## true_beta is indexed by rsid, so each estimate is compared SNP-by-SNP
## against its own true effect.
rmse_of <- function(est, rsid) {
  sqrt(mean((est - true_beta[rsid])^2))
}
rmse <- data.frame(
  naive = rmse_of(out_CL$beta, out_CL$rsid),
  CL1   = rmse_of(out_CL$beta.cl1, out_CL$rsid),
  CL2   = rmse_of(out_CL$beta.cl2, out_CL$rsid),
  CL3   = rmse_of(out_CL$beta.cl3, out_CL$rsid),
  EB    = rmse_of(out_EB_sig$beta_EB, out_EB_sig$rsid),
  FIQT  = rmse_of(out_FIQT_sig$beta_FIQT, out_FIQT_sig$rsid),
  boot  = rmse_of(out_BR_ss_sig$beta_BR_ss, out_BR_ss_sig$rsid)
)
rmse
#>        naive       CL1         CL2        CL3         EB        FIQT
#> 1 0.01528707 0.0165505 0.009738775 0.01285767 0.01217377 0.008195889
#>          boot
#> 1 0.008379167

The next metric, the average bias over all significant SNPs with positive association estimates, is included below. Here, we wish to obtain values closer to zero than the naive approach and it can be seen that all methods achieve this.

## bias positive: average bias over significant SNPs whose naive estimate
## is positive. pos_sig holds rsids; pos_sigA holds the matching row
## indices of out_CL.
pos_sig <- out_CL$rsid[out_CL$beta > 0]
pos_sigA <- which(out_CL$rsid %in% pos_sig)

## FIX: the EB/FIQT/boot data frames are aligned to pos_sig by rsid via
## match() rather than reusing out_CL's row indices (pos_sigA), which would
## silently misalign the differences if the methods order their output rows
## differently. (If the row orders already coincided, values are unchanged
## and the printed output below still holds.)
bias_up <- data.frame(
  naive = mean(out_CL$beta[pos_sigA] - true_beta[pos_sig]),
  CL1 = mean(out_CL$beta.cl1[pos_sigA] - true_beta[pos_sig]),
  CL2 = mean(out_CL$beta.cl2[pos_sigA] - true_beta[pos_sig]),
  CL3 = mean(out_CL$beta.cl3[pos_sigA] - true_beta[pos_sig]),
  EB = mean(out_EB_sig$beta_EB[match(pos_sig, out_EB_sig$rsid)] - true_beta[pos_sig]),
  FIQT = mean(out_FIQT_sig$beta_FIQT[match(pos_sig, out_FIQT_sig$rsid)] - true_beta[pos_sig]),
  boot = mean(out_BR_ss_sig$beta_BR_ss[match(pos_sig, out_BR_ss_sig$rsid)] - true_beta[pos_sig])
)
bias_up
#>        naive          CL1          CL2          CL3         EB         FIQT
#> 1 0.01268454 -0.003919891 -0.000829573 -0.002374732 0.01037972 -0.002254763
#>           boot
#> 1 -0.001818847

In a similar manner, the average bias over all significant SNPs with negative association estimates, is computed. Again, we see that the application of all methods result in, on average, less biased association estimates for these SNPs, as desired.

## bias negative: average bias over significant SNPs whose naive estimate
## is negative. neg_sig holds rsids; neg_sigA holds the matching row
## indices of out_CL.
neg_sig <- out_CL$rsid[out_CL$beta < 0]
neg_sigA <- which(out_CL$rsid %in% neg_sig)

## FIX: align the EB/FIQT/boot rows to neg_sig by rsid via match() instead
## of reusing out_CL's row indices, so the element-wise differences remain
## correct even if the methods order their output rows differently. (If the
## row orders already coincided, the printed values below are unchanged.)
bias_down <- data.frame(
  naive = mean(out_CL$beta[neg_sigA] - true_beta[neg_sig]),
  CL1 = mean(out_CL$beta.cl1[neg_sigA] - true_beta[neg_sig]),
  CL2 = mean(out_CL$beta.cl2[neg_sigA] - true_beta[neg_sig]),
  CL3 = mean(out_CL$beta.cl3[neg_sigA] - true_beta[neg_sig]),
  EB = mean(out_EB_sig$beta_EB[match(neg_sig, out_EB_sig$rsid)] - true_beta[neg_sig]),
  FIQT = mean(out_FIQT_sig$beta_FIQT[match(neg_sig, out_FIQT_sig$rsid)] - true_beta[neg_sig]),
  boot = mean(out_BR_ss_sig$beta_BR_ss[match(neg_sig, out_BR_ss_sig$rsid)] - true_beta[neg_sig])
)
bias_down
#>         naive         CL1         CL2         CL3           EB        FIQT
#> 1 -0.01250495 0.005887153 0.002532757 0.004209955 -0.006742937 0.002466867
#>          boot
#> 1 0.003934355

Note: In the above results, it is seen that the empirical Bayes method doesn’t seem to be performing as well as the other methods, i.e. the reductions in rmse and bias are small in comparison. The reason for this is perhaps that it is the original form of the empirical Bayes method that has been used here. It is possible that using another variation of the method, for example setting method="gam_nb" or method="scam", would result in improvements, as shown in Forde et al. (2023).

To complement the above, we provide an illustration of boxplots obtained for the root mean square error of significant SNPs (RMSE of sig. SNPs at 5 × 10−8), plotted for each Winner’s Curse correction method (Method).

library(RColorBrewer)
library(ggplot2)
col <- brewer.pal(8, "Dark2")

## Number of significant SNPs; replaces the hard-coded count (33) so the
## chunk keeps working if the simulation or threshold changes.
n_sig <- nrow(out_CL)

## Long-format stack: one copy of the significant SNPs per method, with
## adjusted estimates aligned to out_CL's rsid order via match() (safe even
## if the methods order their output rows differently).
all_results <- data.frame(
  rsid = rep(out_CL$rsid, 7),
  beta = rep(out_CL$beta, 7),
  se = rep(out_CL$se, 7),
  adj_beta = c(out_CL$beta, out_CL$beta.cl1, out_CL$beta.cl2, out_CL$beta.cl3,
               out_EB_sig$beta_EB[match(out_CL$rsid, out_EB_sig$rsid)],
               out_FIQT_sig$beta_FIQT[match(out_CL$rsid, out_FIQT_sig$rsid)],
               out_BR_ss_sig$beta_BR_ss[match(out_CL$rsid, out_BR_ss_sig$rsid)]),
  method = rep(c("naive", "CL1", "CL2", "CL3", "EB", "FIQT", "boot"),
               each = n_sig)
)
## Per-SNP absolute error; sqrt((x)^2) is simply abs(x).
all_results$rmse <- abs(all_results$adj_beta - true_beta[all_results$rsid])
all_results$method <- factor(all_results$method,
                             levels = c("naive", "CL1", "CL2", "CL3",
                                        "EB", "boot", "FIQT"))

## Boxplots of per-SNP error by method. alpha is a fixed aesthetic, so it is
## set outside aes(); fill/colour are mapped once in the ggplot() call.
ggplot(all_results, aes(x = method, y = rmse, fill = method, color = method)) +
  geom_boxplot(size = 0.7, alpha = 0.2) +
  scale_fill_manual(values = col[1:7]) +
  scale_color_manual(values = col[1:7]) +
  xlab("Method") +
  ylab(expression(paste("RMSE of sig. SNPs at  ", 5%*%10^-8))) +
  theme_bw() +
  geom_hline(yintercept = 0, colour = "black") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        text = element_text(size = 12), legend.position = "none",
        strip.text = element_text(face = "italic"))

In addition, we can gain a visual insight into the adjustments made by these functions by constructing a density plot with the adjusted absolute values along with the naive estimates and the true absolute β values of significant SNPs, as follows:

## Add the true effect sizes as their own "method" for the density plot.
## nrow(out_CL) replaces the hard-coded count (33) so the chunk keeps
## working if the number of significant SNPs changes.
true <- data.frame(rsid = out_CL$rsid, beta = out_CL$beta, se = out_CL$se,
                   adj_beta = true_beta[out_CL$rsid],
                   method = rep("true", nrow(out_CL)))
all_resultsA <- rbind(all_results[, 1:5], true)

## Density of |adjusted beta| per method, overlaid with the true values.
ggplot(all_resultsA, aes(x = abs(adj_beta), colour = method, fill = method)) +
  geom_density(alpha = 0.05, size = 1) +
  scale_color_brewer(palette = "Dark2") +
  scale_fill_brewer(palette = "Dark2") +
  xlim(-0.01, 0.08) +
  ylab("Density") +
  xlab(expression(paste("|", beta, "| " , "of sig. SNPs at  ", 5%*%10^-8))) +
  theme_bw() +
  theme(text = element_text(size = 12)) +
  labs(fill = "Method", colour = "Method")

   

Application of methods to a real data set

In this final section, we will provide a short demonstration of applying these Winner’s Curse correction methods to a real data set.

Briefly, the data set we will use is a subset of the first BMI data set used in Forde et al. (2023), which was obtained from the UK Biobank. In order to obtain a smaller data set suitable for demonstration purposes in this vignette, LD clumping was performed on the data set used in the aforementioned paper using the ld_clump() function from the ieugwasr R package. The LD (r-squared) threshold for clumping was set to 0.01 while the physical distance threshold used was 100 kb. This procedure provided a data set of 272,604 SNPs with corresponding association estimates (beta) and standard errors (se) as well as association estimates obtained in an independent replication GWAS of a similar size (beta_rep). The first six rows of this data set are outputted below.

## Download the LD-clumped BMI summary statistics (UK Biobank subset) and
## assign column names: chromosome, position, rsid, discovery beta/se, and
## the replication estimate beta_rep.
## NOTE(review): quote = "" disables quote parsing, so the rsid values keep
## their embedded double quotes (visible in the output below) — confirm this
## is intended before matching rsids against other resources.
BMI_UKBB <- read.csv("https://raw.githubusercontent.com/amandaforde/winnerscurse_realdata/main/data/summary_stats_BMI.txt",quote = "", row.names = NULL, col.names = c("chr","pos","rsid","beta","se","beta_rep"), stringsAsFactors = FALSE) 

head(BMI_UKBB)
#>   chr    pos          rsid        beta        se    beta_rep
#> 1   1 801536  "rs79373928"  0.00691695 0.0672237 -0.00845073
#> 2   1 809876  "rs57181708"  0.08085120 0.0271055  0.05181780
#> 3   1 815656 "rs557970564"  0.16602900 0.0887715 -0.13927300
#> 4   1 834430 "rs533316052" -0.06663160 0.0685312 -0.05314070
#> 5   1 837657 "rs149737509" -0.00116905 0.0623581 -0.06175060
#> 6   1 852758   "rs4970462" -0.02576140 0.0200347 -0.02956890

We apply the correction methods to this data set as follows and obtain adjusted association estimates for significant SNPs at the genome-wide significance threshold of 5 × 10−8.

## Keep only the columns the correction methods expect: rsid, beta, se.
BMI_data <- BMI_UKBB[,3:5]

## Apply each Winner's Curse correction to the real BMI data set.
## NOTE(review): unlike the simulated example above, seed_opt=TRUE is not
## passed to BR_ss here, so the seed argument may have no effect — confirm
## against the BR_ss documentation.
out_CL_BMI <- conditional_likelihood(summary_data = BMI_data, alpha = 5e-8)
out_EB_BMI <- empirical_bayes(summary_data = BMI_data)
out_FIQT_BMI <- FDR_IQT(summary_data = BMI_data)
out_boot_BMI <- BR_ss(summary_data = BMI_data, seed=2020)

## Subset required data sets:
out_EB_BMI_sig <- out_EB_BMI[2*pnorm(abs(out_EB_BMI$beta/out_EB_BMI$se), lower.tail = FALSE) < 5e-8,]
out_FIQT_BMI_sig <- out_FIQT_BMI[2*pnorm(abs(out_FIQT_BMI$beta/out_FIQT_BMI$se), lower.tail=FALSE) < 5e-8,]
out_boot_BMI_sig <- out_boot_BMI[2*pnorm(abs(out_boot_BMI$beta/out_boot_BMI$se), lower.tail=FALSE) < 5e-8,]

Note: In the case of real data, the true association values are unknown. Therefore, in order to evaluate the performance of methods, we will make use of beta_rep, the association estimates obtained in an independent replication GWAS of a similar size. These association estimates are considered to be unbiased, and thus can be used to obtain an expression for the estimated mean square error (MSE) of significant SNPs. This expression is given in Eq [14] of Forde et al. (2023) and can be computed for each method as shown below. It can be seen that each method provides an estimated MSE value lower than that of the naive approach of using the unadjusted association estimates from the discovery GWAS.

## mse: estimated MSE of significant SNPs (Eq [14], Forde et al. 2023),
## using the replication estimates beta_rep as unbiased stand-ins for the
## unknown true effects.
## FIX: align beta_rep to out_CL_BMI's rows by rsid with match(). The
## original %in% subsetting returned beta_rep in BMI_UKBB's (genomic) order,
## which silently misaligns the element-wise differences if out_CL_BMI's
## rows are ordered differently (e.g. by significance, as the simulated
## example suggests). If the orders already coincided, the printed values
## below are unchanged.
beta_rep <- BMI_UKBB$beta_rep[match(out_CL_BMI$rsid, BMI_UKBB$rsid)]

mse <- data.frame(
  naive = mean((beta_rep - out_CL_BMI$beta)^2) - mean(out_CL_BMI$se^2),
  CL1 = mean((beta_rep - out_CL_BMI$beta.cl1)^2) - mean(out_CL_BMI$se^2),
  CL2 = mean((beta_rep - out_CL_BMI$beta.cl2)^2) - mean(out_CL_BMI$se^2),
  CL3 = mean((beta_rep - out_CL_BMI$beta.cl3)^2) - mean(out_CL_BMI$se^2),
  ## For the methods that return their own data frames, align beta_rep to
  ## each one's rsid order as well before differencing.
  EB = mean((beta_rep[match(out_EB_BMI_sig$rsid, out_CL_BMI$rsid)] - out_EB_BMI_sig$beta_EB)^2) - mean(out_EB_BMI_sig$se^2),
  boot = mean((beta_rep[match(out_boot_BMI_sig$rsid, out_CL_BMI$rsid)] - out_boot_BMI_sig$beta_BR_ss)^2) - mean(out_boot_BMI_sig$se^2),
  FIQT = mean((beta_rep[match(out_FIQT_BMI_sig$rsid, out_CL_BMI$rsid)] - out_FIQT_BMI_sig$beta_FIQT)^2) - mean(out_FIQT_BMI_sig$se^2)
)

mse
#>        naive        CL1        CL2        CL3         EB       boot       FIQT
#> 1 0.03608036 0.02946674 0.02870324 0.02897353 0.03029724 0.02874184 0.02859518

For illustration purposes, we again simply produce a density plot with the adjusted absolute values along with the naive estimates and the true absolute β values of significant SNPs.

## Long-format stack of naive and adjusted estimates for the BMI data.
## nrow(out_CL_BMI) replaces the hard-coded count (214); adjusted estimates
## are aligned to out_CL_BMI's rsid order via match() so rows pair up even
## if the methods order their output differently.
n_sig_BMI <- nrow(out_CL_BMI)
all_results <- data.frame(
  rsid = rep(out_CL_BMI$rsid, 7),
  beta = rep(out_CL_BMI$beta, 7),
  se = rep(out_CL_BMI$se, 7),
  adj_beta = c(out_CL_BMI$beta, out_CL_BMI$beta.cl1, out_CL_BMI$beta.cl2,
               out_CL_BMI$beta.cl3,
               out_EB_BMI_sig$beta_EB[match(out_CL_BMI$rsid, out_EB_BMI_sig$rsid)],
               out_FIQT_BMI_sig$beta_FIQT[match(out_CL_BMI$rsid, out_FIQT_BMI_sig$rsid)],
               out_boot_BMI_sig$beta_BR_ss[match(out_CL_BMI$rsid, out_boot_BMI_sig$rsid)]),
  method = rep(c("naive", "CL1", "CL2", "CL3", "EB", "FIQT", "boot"),
               each = n_sig_BMI)
)
all_results$method <- factor(all_results$method,
                             levels = c("naive", "CL1", "CL2", "CL3",
                                        "EB", "boot", "FIQT"))

## Replication estimates as their own "method". Named rep_df rather than
## `rep` so the data frame does not shadow base::rep.
rep_df <- data.frame(rsid = out_CL_BMI$rsid, beta = out_CL_BMI$beta,
                     se = out_CL_BMI$se, adj_beta = beta_rep,
                     method = rep("rep", n_sig_BMI))
all_resultsA <- rbind(all_results[, 1:5], rep_df)

## Density of |adjusted beta| per method, overlaid with replication values.
ggplot(all_resultsA, aes(x = abs(adj_beta), colour = method, fill = method)) +
  geom_density(alpha = 0.05, size = 1) +
  scale_color_brewer(palette = "Dark2") +
  scale_fill_brewer(palette = "Dark2") +
  xlim(-0.1, 0.4) +
  ylab("Density") +
  xlab(expression(paste("|", beta, "| " , "of sig. SNPs at  ", 5%*%10^-8))) +
  theme_bw() +
  theme(text = element_text(size = 12)) +
  labs(fill = "Method", colour = "Method")