# library(devtools)
# load_all()
# ieugwasr::select_api("dev2")
# Authorise with the IEU OpenGWAS API
ieugwasr::get_access_token()
# This is a small file, good for testing
ebi_id <- "GCST005522"
x <- EbiDataset$new(
ebi_id = ebi_id,
ftp_path = get_ftp_path(ebi_id),
igd_id = "ebi-a-EBITEST",
wd = paste0("uploads/", ebi_id)
)
x$pipeline()   # run the full import pipeline
The same process can also be run step by step, deleting the test upload at the end:
ebi_id <- "GCST005522"
x <- EbiDataset$new(
ebi_id = ebi_id,
ftp_path = get_ftp_path(ebi_id),
igd_id = "ebi-a-EBITEST",
wd = paste0("uploads/", ebi_id)
)
x$download_dataset()                      # download the harmonised file from the EBI FTP
x$format_ebi_dataset()                    # reformat the EBI harmonised data
x$determine_columns(
	params=list(chr_col=1, snp_col=2, pos_col=3, oa_col=4, ea_col=5,
	            eaf_col=6, beta_col=7, se_col=8, pval_col=9),
	gwas_file=x$gwas_out1
)                                         # map each required field to its column index
x$format_dataset(gwas_file=x$gwas_out1)   # convert to the format expected for upload
x$organise_metadata()                     # assemble the study metadata
x$api_metadata_upload()                   # register the metadata via the API
x$api_gwasdata_upload()                   # upload the GWAS data
x$api_gwasdata_delete()                   # remove the test upload again
To get a list of datasets that need to be imported:
ignore_list <- scan(system.file(package="EbiDataImport", "extdata/ebi_ignore_list.txt"), character())
newdats <- determine_new_datasets(blacklist=ignore_list, exclude = FALSE)
suhre <- subset(newdats, grepl("Suhr", path))
suhre <- tidyr::separate(suhre, path, sep="/", into=c("p1","p2","p3"), remove=FALSE)
temp <- do.call(rbind, strsplit(suhre$p3, split="_"))
suhre$igd_id <- paste0("prot-c-", temp[,1], "_", temp[,2])
suhre$igd_id
# Trait annotation from the Helmholtz pGWAS server; provides the seqid and target (protein name) used below
traitinfo <- data.table::fread("http://metabolomics.helmholtz-muenchen.de/pgwas/download/probeanno.tsv", sep="\t")
traitinfo$igd_id <- paste0("prot-c-", traitinfo$seqid)
suhre <- merge(suhre, traitinfo, by="igd_id")
save(suhre, file="suhre.rdata")
load("suhre.rdata")
for(i in 1:nrow(suhre))
{
x <- EbiDataset$new(
ebi_id=suhre$ebi_id[i],
ftp_path=suhre$path[i],
igd_id=suhre$igd_id[i],
traitname=suhre$target[i]
)
x$download_dataset()
x$pipeline()
}
Conclusion: these datasets appear to be missing standard errors, so they will need to be handled through the batch processing instead.
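As an illustration only (this is not the package's batch processing route), wrapping the same loop in tryCatch would record which of the suhre datasets fail and why, which should confirm whether missing standard errors are indeed the problem:
failed <- list()
for(i in 1:nrow(suhre))
{
	msg <- tryCatch({
		x <- EbiDataset$new(
			ebi_id=suhre$ebi_id[i],
			ftp_path=suhre$path[i],
			igd_id=suhre$igd_id[i],
			traitname=suhre$target[i]
		)
		x$download_dataset()
		x$pipeline()
		NA_character_                          # success
	}, error=function(e) conditionMessage(e))
	if(!is.na(msg)) failed[[suhre$ebi_id[i]]] <- msg
}
failed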
We still need to figure out how to store this information. Many EBI datasets don't have sufficient info to be included, so we need to ignore them rather than attempting to upload them every time we look for new studies.
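One possible approach (a sketch only, not something the package currently does) is to append the IDs of datasets that fail to import, e.g. the failed list recorded above, to the ignore list that determine_new_datasets() reads:
ignore_file <- system.file(package="EbiDataImport", "extdata/ebi_ignore_list.txt")
ignore_list <- scan(ignore_file, character())
new_ignores <- setdiff(names(failed), ignore_list)               # failures not already ignored
writeLines(c(ignore_list, new_ignores), "ebi_ignore_list.txt")   # updated list, written locally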
On the ieu-ssd machine, where the original set of EBI datasets was downloaded and processed, this is how the first ignore list was determined:
library(dplyr)
a <- read.csv("/data/ebi_gwascatalog/pmids.txt")
b <- scan("/data/ebi_gwascatalog/study_ids.txt", character())
got <- ieugwasr::gwasinfo() %>% subset(., grepl("ebi-a", id)) %>% {.$id} %>% gsub("ebi-a-", "", .)
a$got <- a$StudyID %in% got
head(a)
table(a$got)
table(a$got, a$Status)
# Studies that are harmonised at EBI but were not imported previously
ebi_ignore_list <- a$StudyID[a$Status == "harmonised" & !a$got]
write.table(ebi_ignore_list, file="ebi_ignore_list.txt", row.names=FALSE, col.names=FALSE, quote=FALSE)
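For the system.file() call at the top of this section to pick up an updated list, the file presumably needs to be copied into the package source and the package reinstalled (the inst/extdata path here is an assumption about the source layout):
file.copy("ebi_ignore_list.txt", "inst/extdata/ebi_ignore_list.txt", overwrite=TRUE)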