---
title: EBI upload pipeline
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{EBI upload pipeline}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Chunks are not evaluated when the vignette is built (eval = FALSE).
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)
```

```{r setup}
library(GwasDataImport)
```

## Objectives

1. Download and format the data and metadata for an EBI dataset
2. Upload the formatted EBI data and metadata to OpenGWAS API
3. Determine which datasets in EBI are not present in OpenGWAS. Allow a 'blacklist' of EBI datasets to ignore. (TODO)

## Complete workflow for one dataset

```{r, eval=FALSE}
# library(devtools)
# load_all()
# ieugwasr::select_api("dev2")

# Authorise
ieugwasr::get_access_token()

# This is a small file, good for testing
ebi_id <- "GCST005522"
x <- EbiDataset$new(
  ebi_id = ebi_id,
  ftp_path = get_ftp_path(ebi_id),
  igd_id = "ebi-a-EBITEST",
  wd = paste0("uploads/", ebi_id)
)
x$pipeline()
```

Now delete:

```{r, eval=FALSE}
x$api_gwasdata_delete()
```

## Running each step individually

```{r, eval=FALSE}
ebi_id <- "GCST005522"
x <- EbiDataset$new(
  ebi_id = ebi_id,
  ftp_path = get_ftp_path(ebi_id),
  igd_id = "ebi-a-EBITEST",
  wd = paste0("uploads/", ebi_id)
)
x$download_dataset()
x$format_ebi_dataset()
# Column positions are given explicitly for the file in x$gwas_out1.
x$determine_columns(
  params = list(
    chr_col = 1,
    snp_col = 2,
    pos_col = 3,
    oa_col = 4,
    ea_col = 5,
    eaf_col = 6,
    beta_col = 7,
    se_col = 8,
    pval_col = 9
  ),
  gwas_file = x$gwas_out1
)
x$format_dataset(gwas_file = x$gwas_out1)
x$organise_metadata()
x$api_metadata_upload()
x$api_gwasdata_upload()
x$api_gwasdata_delete()
```

## Workflow for all datasets

To get a list of datasets that need to be imported:

```{r, eval=FALSE}
# system.file() returns "" when the package or file is not found, and
# scan("") would then wait for console input, so the package name must match
# the package that ships the list.
# NOTE(review): assumes extdata/ebi_ignore_list.txt is installed with
# GwasDataImport (the package this vignette loads) - confirm.
ignore_list <- scan(
  system.file(package = "GwasDataImport", "extdata/ebi_ignore_list.txt"),
  character()
)
newdats <- determine_new_datasets(blacklist = ignore_list)
# Keep only the rows flagged in the `need` column.
newdats <- subset(being_processed(newdats), need)
```

```{r, eval=FALSE}
ignore <- list()
# Iteration starts at the second row. seq_len(...)[-1] is used instead of
# 2:nrow(newdats) so that a table with fewer than two rows gives zero
# iterations rather than counting backwards.
# NOTE(review): confirm that skipping the first row is intentional.
for (i in seq_len(nrow(newdats))[-1]) {
  message(newdats$ebi_id[i])
  x <- EbiDataset$new(
    ebi_id = newdats$ebi_id[i],
    ftp_path = newdats$path[i],
    wd = paste0("uploads/", newdats$ebi_id[i])
  )
  o <- x$pipeline()
  # Any non-NULL value returned by the pipeline is recorded against the
  # dataset's EBI id.
  if (!is.null(o)) {
    ignore[[newdats$ebi_id[i]]] <- o
  }
  rm(x)
}
```

## Lookup build

```{r, eval=FALSE}
x <- EbiDataset$new("GCST000755", ftp_path = get_ftp_path("GCST000755"))
x$download_dataset()
x$format_dataset()
```

## Suhre proteins

```{r, eval=FALSE}
# NOTE(review): assumes extdata/ebi_ignore_list.txt is installed with
# GwasDataImport (the package this vignette loads) - confirm.
ignore_list <- scan(
  system.file(package = "GwasDataImport", "extdata/ebi_ignore_list.txt"),
  character()
)
newdats <- determine_new_datasets(blacklist = ignore_list, exclude = FALSE)

# Select the rows whose path contains "Suhr" and split the path on "/".
suhre <- subset(newdats, grepl("Suhr", path))
suhre <- tidyr::separate(
  suhre,
  path,
  sep = "/",
  into = c("p1", "p2", "p3"),
  remove = FALSE
)

# Build the OpenGWAS id from the first two "_"-separated pieces of p3.
temp <- do.call(rbind, strsplit(suhre$p3, split = "_"))
suhre$igd_id <- paste0("prot-c-", temp[, 1], "_", temp[, 2])
suhre$igd_id

traitinfo <- data.table::fread(
  "http://metabolomics.helmholtz-muenchen.de/pgwas/download/probeanno.tsv",
  sep = "\t"
)
traitinfo$igd_id <- paste0("prot-c-", traitinfo$seqid)
suhre <- merge(suhre, traitinfo, by = "igd_id")

save(suhre, file = "suhre.rdata")
load("suhre.rdata")

# seq_len() gives zero iterations when suhre has no rows.
for (i in seq_len(nrow(suhre))) {
  x <- EbiDataset$new(
    ebi_id = suhre$ebi_id[i],
    ftp_path = suhre$path[i],
    igd_id = suhre$igd_id[i],
    traitname = suhre$target[i]
  )
  x$download_dataset()
  x$pipeline()
}
```

Conclusion - they seem to be missing standard errors? Need to do this through the batch processing.

## EBI Ignore list

Need to figure out a solution for storing this. Many EBI datasets don't have sufficient info to be included, so we need to ignore them instead of trying to upload them each time we look for new studies. On the `ieu-ssd` machine where the original set of EBI datasets was downloaded and processed, this is how the first ignore list was determined:

```r
library(dplyr)
a <- read.csv("/data/ebi_gwascatalog/pmids.txt")
b <- scan("/data/ebi_gwascatalog/study_ids.txt", character())

# Ids of the "ebi-a" datasets reported by gwasinfo(), with the prefix removed.
got <- ieugwasr::gwasinfo() %>%
  subset(., grepl("ebi-a", id)) %>%
  {.$id} %>%
  gsub("ebi-a-", "", .)

a$got <- a$StudyID %in% got
head(a)
table(a$got)
table(a$got, a$Status)

# Studies with status "harmonised" whose id is not in `got`.
ebi_ignore_list <- a$StudyID[a$Status == "harmonised" & !a$got]
write.table(
  ebi_ignore_list,
  file = "ebi_ignore_list.txt",
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)
```