EBI upload pipeline

library(GwasDataImport)

Objectives

  1. Download and format the data and metadata for an EBI dataset
  2. Upload the formatted EBI data and metadata to OpenGWAS API
  3. Determine which datasets in EBI are not present in OpenGWAS. Allow a ‘blacklist’ of EBI datasets to ignore. (TODO)

Complete workflow for one dataset

# Optional development setup, left disabled (presumably for running against a
# local checkout and the "dev2" API -- confirm before enabling):
# library(devtools)
# load_all()
# ieugwasr::select_api("dev2")

# Obtain an access token before talking to the API
ieugwasr::get_access_token()

# GCST005522 is a small file, which makes it convenient for a test run
ebi_id <- "GCST005522"
x <- EbiDataset$new(
    ebi_id = ebi_id,
    ftp_path = get_ftp_path(ebi_id),
    igd_id = "ebi-a-EBITEST",
    wd = paste0("uploads/", ebi_id)
)

# Run the whole workflow in one call
x$pipeline()

Now delete:

# Delete the dataset held by `x` via the API
# (presumably the "ebi-a-EBITEST" upload created above -- confirm).
x$api_gwasdata_delete()

Running each step individually

# Same small test dataset, but with every stage invoked by hand
ebi_id <- "GCST005522"
x <- EbiDataset$new(
    ebi_id = ebi_id,
    ftp_path = get_ftp_path(ebi_id),
    igd_id = "ebi-a-EBITEST",
    wd = paste0("uploads/", ebi_id)
)

# Fetch the raw file and convert it from the EBI layout
x$download_dataset()
x$format_ebi_dataset()

# Tell the object which column of x$gwas_out1 holds which field
x$determine_columns(
    params = list(
        chr_col = 1,
        snp_col = 2,
        pos_col = 3,
        oa_col = 4,
        ea_col = 5,
        eaf_col = 6,
        beta_col = 7,
        se_col = 8,
        pval_col = 9
    ),
    gwas_file = x$gwas_out1
)
x$format_dataset(gwas_file = x$gwas_out1)

# Metadata first, then the GWAS data itself
x$organise_metadata()
x$api_metadata_upload()
x$api_gwasdata_upload()

# Clean up the test upload again
x$api_gwasdata_delete()

Workflow for all datasets

To get a list of datasets that need to be imported:

# Read the ignore list that ships with the package loaded at the top of this
# document (GwasDataImport). mustWork = TRUE turns a missing file into an
# error; without it system.file() returns "" and scan("") would silently wait
# for console input instead.
ignore_file <- system.file(
    "extdata/ebi_ignore_list.txt",
    package = "GwasDataImport",
    mustWork = TRUE
)
ignore_list <- scan(ignore_file, character())

# Datasets not covered by the ignore list, restricted to rows flagged `need`
# in the result of being_processed()
newdats <- determine_new_datasets(blacklist = ignore_list)
newdats <- subset(being_processed(newdats), need)

# Collects every non-NULL value returned by pipeline(), keyed by EBI study ID
ignore <- list()

# seq_len() visits every row (the loop previously started at row 2 and so
# skipped the first dataset) and is a no-op when there are no rows, whereas
# 2:nrow() would count backwards for fewer than two rows.
for (i in seq_len(nrow(newdats))) {
    message(newdats$ebi_id[i])
    x <- EbiDataset$new(
        ebi_id = newdats$ebi_id[i],
        ftp_path = newdats$path[i],
        wd = paste0("uploads/", newdats$ebi_id[i])
    )
    o <- x$pipeline()
    if (!is.null(o)) {
        ignore[[newdats$ebi_id[i]]] <- o
    }
    rm(x)
}

Lookup build

# Study used for the build lookup
build_ebi_id <- "GCST000755"
x <- EbiDataset$new(build_ebi_id, ftp_path = get_ftp_path(build_ebi_id))

# Download, then format with the default arguments
x$download_dataset()
x$format_dataset()

Suhre proteins

# Read the ignore list from the package loaded at the top of this document
# (GwasDataImport). mustWork = TRUE makes a missing file an error rather than
# letting scan("") fall back to reading from the console.
ignore_file <- system.file(
    "extdata/ebi_ignore_list.txt",
    package = "GwasDataImport",
    mustWork = TRUE
)
ignore_list <- scan(ignore_file, character())
newdats <- determine_new_datasets(blacklist = ignore_list, exclude = FALSE)

# Keep only datasets whose FTP path mentions "Suhr"
suhre <- subset(newdats, grepl("Suhr", path))

# Split the path on "/" into three parts, keeping the original `path` column
suhre <- separate(
    suhre,
    path,
    sep = "/",
    into = c("p1", "p2", "p3"),
    remove = FALSE
)

# Build the OpenGWAS ID from the first two "_"-separated tokens of the third
# path component
temp <- do.call(rbind, strsplit(suhre$p3, split = "_"))
suhre$igd_id <- paste0("prot-c-", temp[, 1], "_", temp[, 2])
suhre$igd_id

# Probe annotation table; its `seqid` column gives the matching ID
traitinfo <- data.table::fread(
    "http://metabolomics.helmholtz-muenchen.de/pgwas/download/probeanno.tsv",
    sep = "\t"
)
traitinfo$igd_id <- paste0("prot-c-", traitinfo$seqid)

# Attach the annotation to each dataset and cache the result on disk
suhre <- merge(suhre, traitinfo, by = "igd_id")
save(suhre, file = "suhre.rdata")

# Restore the `suhre` table saved to suhre.rdata
load("suhre.rdata")

# seq_len() makes the loop a no-op for an empty table; 1:nrow() would iterate
# over c(1, 0) and index rows that do not exist.
for (i in seq_len(nrow(suhre))) {
    x <- EbiDataset$new(
        ebi_id = suhre$ebi_id[i],
        ftp_path = suhre$path[i],
        igd_id = suhre$igd_id[i],
        traitname = suhre$target[i]
    )
    # NOTE(review): the other workflows in this document call pipeline()
    # without a separate download first -- confirm whether this explicit
    # download is still required.
    x$download_dataset()
    x$pipeline()
}

Conclusion: these datasets appear to be missing standard errors (still to be confirmed), so they need to be handled through the batch processing instead.

EBI Ignore list

Need to figure out a solution for storing this. Many EBI datasets don’t have sufficient info to be included, so we need to ignore them instead of trying to upload them each time we look for new studies.

On the ieu-ssd machine where the original set of EBI datasets was downloaded and processed, this is how the first ignore list was determined:

library(dplyr)
# Per-study table; the code below relies on its StudyID and Status columns
a <- read.csv("/data/ebi_gwascatalog/pmids.txt")

# Study IDs as a character vector (read here but not used further below)
b <- scan("/data/ebi_gwascatalog/study_ids.txt", character())

# IDs already in OpenGWAS: keep those matching "ebi-a" and strip the
# "ebi-a-" prefix so they can be compared with a$StudyID
got <- ieugwasr::gwasinfo() %>%
    subset(., grepl("ebi-a", id)) %>%
    {.$id} %>%
    gsub("ebi-a-", "", .)

# Flag the studies that are already present
a$got <- a$StudyID %in% got

# Interactive sanity checks
head(a)
table(a$got)
table(a$got, a$Status)

# Studies marked "harmonised" that are nevertheless absent from OpenGWAS.
# which() drops rows whose Status is NA instead of emitting NA entries.
ebi_ignore_list <- a$StudyID[which(a$Status == "harmonised" & !a$got)]

# One ID per line, with no header, row names or quoting. Argument names are
# spelled out rather than relying on partial matching and T/F.
write.table(
    ebi_ignore_list,
    file = "ebi_ignore_list.txt",
    row.names = FALSE,
    col.names = FALSE,
    quote = FALSE
)