This RMarkdown document demonstrates how key elements from the GitHub notebook for the meta functionalities can be achieved using the R package. For detailed explanations of the case study, please refer to the notebook on GitHub.
Here we show the following aspects of the EpiGraphDB platform, and how to use the API to get the information:
For detailed documentation on the API endpoints please visit:
library("magrittr")
library("dplyr")
library("purrr")
#>
#> Attaching package: 'purrr'
#> The following object is masked from 'package:magrittr':
#>
#> set_names
library("igraph")
#>
#> Attaching package: 'igraph'
#> The following objects are masked from 'package:purrr':
#>
#> compose, simplify
#> The following objects are masked from 'package:dplyr':
#>
#> as_data_frame, groups, union
#> The following objects are masked from 'package:stats':
#>
#> decompose, spectrum
#> The following object is masked from 'package:base':
#>
#> union
library("epigraphdb")
Here we query for the metadata information using the endpoint `GET /meta/schema`, which will be used for downstream processing.
# Request the graph schema in "raw" mode, with the graphviz and plot flags
# both set to FALSE.
endpoint <- "/meta/schema"
params <- list(graphviz = FALSE, plot = FALSE)
metadata <- query_epigraphdb(
  route = endpoint,
  params = params,
  mode = "raw"
)
# Show only the top two levels of the nested list
str(metadata, 2)
#> List of 3
#> $ nodes :List of 12
#> ..$ Disease :List of 2
#> ..$ Pathway :List of 2
#> ..$ LiteratureTerm :List of 2
#> ..$ Gene :List of 2
#> ..$ LiteratureTriple:List of 2
#> ..$ Literature :List of 2
#> ..$ Protein :List of 2
#> ..$ Variant :List of 2
#> ..$ Efo :List of 2
#> ..$ Tissue :List of 2
#> ..$ Drug :List of 2
#> ..$ Gwas :List of 2
#> $ edges :List of 41
#> ..$ OPENTARGETS_DRUG_TO_DISEASE :List of 2
#> ..$ MEDRXIV_SUB :List of 2
#> ..$ GENE_TO_DISEASE :List of 2
#> ..$ GWAS_NLP_EFO :List of 2
#> ..$ SEMMEDDB_PREDICATE :List of 2
#> ..$ STRING_INTERACT_WITH :List of 2
#> ..$ GWAS_TO_LITERATURE_TRIPLE :List of 2
#> ..$ PRS :List of 2
#> ..$ BIORXIV_PREDICATE :List of 2
#> ..$ MONDO_MAP_UMLS :List of 2
#> ..$ SEMMEDDB_SUB :List of 2
#> ..$ MR_EVE_MR :List of 2
#> ..$ OPENGWAS_TOPHITS :List of 2
#> ..$ VARIANT_TO_GENE :List of 2
#> ..$ MEDRXIV_PREDICATE :List of 2
#> ..$ MEDRXIV_OBJ :List of 2
#> ..$ EXPRESSED_IN :List of 2
#> ..$ BIORXIV_OBJ :List of 2
#> ..$ EFO_CHILD_OF :List of 2
#> ..$ BIORXIV_TO_LIT :List of 2
#> ..$ GEN_COR :List of 2
#> ..$ METAMAP_LITE :List of 2
#> ..$ SEMMEDDB_TO_LIT :List of 2
#> ..$ PROTEIN_IN_PATHWAY :List of 2
#> ..$ MONDO_MAP_EFO :List of 2
#> ..$ PATHWAY_CHILD_OF :List of 2
#> ..$ TERM_TO_GENE :List of 2
#> ..$ GWAS_EFO_EBI :List of 2
#> ..$ CPIC :List of 2
#> ..$ XQTL_MULTI_SNP_MR :List of 2
#> ..$ XQTL_SINGLE_SNP_MR_SNP_GENE :List of 2
#> ..$ OBS_COR :List of 2
#> ..$ GWAS_NLP :List of 2
#> ..$ XQTL_SINGLE_SNP_MR_GENE_GWAS:List of 2
#> ..$ MEDRXIV_TO_LIT :List of 2
#> ..$ BIORXIV_SUB :List of 2
#> ..$ OPENTARGETS_DRUG_TO_TARGET :List of 2
#> ..$ GWAS_TO_VARIANT :List of 2
#> ..$ GWAS_TO_LITERATURE :List of 2
#> ..$ GENE_TO_PROTEIN :List of 2
#> ..$ SEMMEDDB_OBJ :List of 2
#> $ connections:List of 41
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
#> ..$ :List of 4
We can extract the specific meta node information as a dataframe from the metadata.
# Build a two-column table (meta node name, node count) from the "nodes"
# part of the schema.
node_meta <- pluck(metadata, "nodes")
meta_node_df <- node_meta %>%
  transpose() %>%
  as_tibble() %>%
  # The node names are the names of the original list; count is a list
  # column after transpose(), so flatten it to an integer vector.
  mutate(
    meta_node = names(node_meta),
    count = flatten_int(count)
  ) %>%
  # Drop the properties column, which does not display well
  select(meta_node, count)

# Display alphabetically, with a thousands separator in the counts
meta_node_df %>%
  arrange(meta_node) %>%
  mutate(count = format(count, big.mark = ","))
#> # A tibble: 12 × 2
#> meta_node count
#> <chr> <chr>
#> 1 Disease " 38,960"
#> 2 Drug " 2,697"
#> 3 Efo " 25,390"
#> 4 Gene " 57,737"
#> 5 Gwas " 34,494"
#> 6 Literature "3,995,672"
#> 7 LiteratureTerm " 108,905"
#> 8 LiteratureTriple "5,609,945"
#> 9 Pathway " 2,441"
#> 10 Protein " 20,280"
#> 11 Tissue " 54"
#> 12 Variant " 99,005"
We can also extract the meta relationship (edge) information, and the connections.
# Build one row per meta relationship: its name, its count, and the meta
# nodes it connects (taken from the "connections" part of the schema).
meta_rel_df <- metadata %>%
  pluck("edges") %>%
  {
    # "edges" is a named list keyed by relationship type; capture the names
    # before transpose() turns the per-relationship fields into columns.
    names <- names(.)
    transpose(.) %>%
      as_tibble() %>%
      mutate(meta_rel = names) %>%
      # count is a list column after transpose(); flatten it to integers
      mutate(count = flatten_int(count)) %>%
      select(meta_rel, count)
  } %>%
  # No `by` is supplied, so dplyr joins on the shared column `meta_rel`
  # and reports that choice in a message.
  inner_join(
    metadata %>%
      pluck("connections") %>%
      {
        transpose(.) %>%
          as_tibble() %>%
          mutate(meta_rel = flatten_chr(rel)) %>%
          # across() replaces the superseded mutate_at(vars(...), fn) form
          mutate(across(c(from_node, to_node), flatten_chr)) %>%
          select(meta_rel, from_node, to_node)
      }
  )
#> Joining, by = "meta_rel"
# Display the relationships ordered by source and target meta node, with a
# thousands separator in the counts.
meta_rel_df %>%
  arrange(from_node, to_node) %>%
  mutate(count = format(count, big.mark = ","))
#> # A tibble: 41 × 4
#> meta_rel count from_node to_node
#> <chr> <chr> <chr> <chr>
#> 1 MONDO_MAP_EFO " 2,819" Disease Efo
#> 2 MONDO_MAP_UMLS " 8,247" Disease LiteratureTerm
#> 3 OPENTARGETS_DRUG_TO_DISEASE " 2,461" Drug Disease
#> 4 CPIC " 375" Drug Gene
#> 5 OPENTARGETS_DRUG_TO_TARGET " 6,534" Drug Gene
#> 6 EFO_CHILD_OF " 43,132" Efo Efo
#> 7 GENE_TO_DISEASE " 5,763" Gene Disease
#> 8 XQTL_MULTI_SNP_MR " 3,015,233" Gene Gwas
#> 9 XQTL_SINGLE_SNP_MR_GENE_GWAS " 8,449,779" Gene Gwas
#> 10 GENE_TO_PROTEIN " 19,142" Gene Protein
#> # … with 31 more rows
Users can use the explorer on the Web UI to search for a specific node by:
Here we show how these are done at the API level using `Gwas` nodes as an example.

First we need to know what the “ID” and “name” fields are for the meta nodes using `GET /meta/nodes/id-name-schema`:
# Retrieve, for every meta node, which property acts as its "id" and which
# as its "name". No query parameters are sent.
endpoint <- "/meta/nodes/id-name-schema"
meta_node_fields <- query_epigraphdb(
  route = endpoint,
  params = NULL,
  mode = "raw"
)
meta_node_fields
#> $Disease
#> $Disease$id
#> [1] "id"
#>
#> $Disease$name
#> [1] "label"
#>
#>
#> $Drug
#> $Drug$id
#> [1] "label"
#>
#> $Drug$name
#> [1] "label"
#>
#>
#> $Efo
#> $Efo$id
#> [1] "id"
#>
#> $Efo$name
#> [1] "value"
#>
#>
#> $Gene
#> $Gene$id
#> [1] "ensembl_id"
#>
#> $Gene$name
#> [1] "name"
#>
#>
#> $Gwas
#> $Gwas$id
#> [1] "id"
#>
#> $Gwas$name
#> [1] "trait"
#>
#>
#> $Literature
#> $Literature$id
#> [1] "id"
#>
#> $Literature$name
#> [1] "id"
#>
#>
#> $LiteratureTerm
#> $LiteratureTerm$id
#> [1] "id"
#>
#> $LiteratureTerm$name
#> [1] "name"
#>
#>
#> $LiteratureTriple
#> $LiteratureTriple$id
#> [1] "id"
#>
#> $LiteratureTriple$name
#> [1] "name"
#>
#>
#> $Pathway
#> $Pathway$id
#> [1] "id"
#>
#> $Pathway$name
#> [1] "name"
#>
#>
#> $Protein
#> $Protein$id
#> [1] "uniprot_id"
#>
#> $Protein$name
#> [1] "uniprot_id"
#>
#>
#> $Tissue
#> $Tissue$id
#> [1] "id"
#>
#> $Tissue$name
#> [1] "name"
#>
#>
#> $Variant
#> $Variant$id
#> [1] "name"
#>
#> $Variant$name
#> [1] "name"
Here we search for nodes that contain “body mass index” in their traits.
# Search Gwas nodes by name; the result is requested as a table.
name <- "body mass index"
endpoint <- "/meta/nodes/Gwas/search"
params <- list(name = name)
results <- query_epigraphdb(
  route = endpoint,
  params = params,
  mode = "table"
)
results
#> # A tibble: 10 × 18
#> node.note node._name node.year node.mr node.author node.sex node.pmid
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Dominance model… Body mass … 2016.0 0 Wood Males a… 26961502…
#> 2 <NA> Body mass … 2015.0 1 Locke AE Females 25673413…
#> 3 <NA> Body mass … 2013.0 1 Randall JC Females 23754948…
#> 4 <NA> Body mass … 2017.0 1 Akiyama M <NA> 28892062…
#> 5 <NA> Body mass … 2018.0 1 Hoffmann TJ <NA> 30108127…
#> 6 <NA> Body mass … 2019.0 1 Ishigaki K Males 28892062…
#> 7 <NA> Body mass … 2015.0 1 Locke AE Males a… 25673413…
#> 8 <NA> Body mass … 2015.0 1 Locke AE Males a… 25673413…
#> 9 <NA> Body mass … 2015.0 1 Locke AE Males 25673413…
#> 10 <NA> Body mass … 2019.0 1 Ishigaki K Males a… 28892062…
#> # … with 11 more variables: node.population <chr>, node.sample_size <chr>,
#> # node.nsnp <chr>, node.build <chr>, node.trait <chr>, node._source <list>,
#> # node.id <chr>, node._id <chr>, node.subcategory <chr>, node.category <chr>,
#> # node.sd <chr>
Similarly, we can match a specific node exactly by its ID.
# Look up a single Gwas node by its ID, using the same search endpoint.
id <- "ieu-a-2"
endpoint <- "/meta/nodes/Gwas/search"
params <- list(id = id)
results <- query_epigraphdb(
  route = endpoint,
  params = params,
  mode = "table"
)
results
#> # A tibble: 1 × 17
#> node._name node.year node.mr node.author node.sex node.pmid node.population
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 Body mass … 2015.0 1 Locke AE Males and… 25673413… Mixed
#> # … with 10 more variables: node.sd <chr>, node.sample_size <chr>,
#> # node.nsnp <chr>, node.build <chr>, node.trait <chr>, node._source <list>,
#> # node.id <chr>, node._id <chr>, node.subcategory <chr>, node.category <chr>
Advanced users that are familiar with Neo4j Cypher can query the database using Cypher directly.
# The edge list returned by /meta/schema contains MR_EVE_MR but no
# relationship type named MR, so a pattern on [mr:MR] matches nothing and
# yields an empty table. Use the relationship type that exists in the schema.
query <- "
MATCH (exposure:Gwas)-[mr:MR_EVE_MR]->(outcome:Gwas)
WHERE exposure.trait = 'Body mass index'
RETURN exposure, outcome, mr LIMIT 2
"
endpoint <- "/cypher"
params <- list(query = query)
# NOTE this is a POST request
results <- query_epigraphdb(
  route = endpoint,
  params = params,
  method = "POST",
  mode = "table"
)
results
#> # A tibble: 0 × 0
sessionInfo
# Print the R version, platform and package versions used for this run
sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.3 LTS
#>
#> Matrix products: default
#> BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] igraph_1.2.11 purrr_0.3.4 magrittr_2.0.1 epigraphdb_0.2.3
#> [5] dplyr_1.0.7
#>
#> loaded via a namespace (and not attached):
#> [1] knitr_1.37 tidyselect_1.1.1 R6_2.5.1 rlang_0.4.12
#> [5] fansi_1.0.0 stringr_1.4.0 httr_1.4.2 tools_4.1.2
#> [9] xfun_0.29 utf8_1.2.2 cli_3.1.0 DBI_1.1.2
#> [13] ellipsis_0.3.2 assertthat_0.2.1 tibble_3.1.6 lifecycle_1.0.1
#> [17] crayon_1.4.2 vctrs_0.3.8 curl_4.3.2 glue_1.6.0
#> [21] evaluate_0.14 stringi_1.7.6 compiler_4.1.2 pillar_1.6.4
#> [25] generics_0.1.1 jsonlite_1.7.2 pkgconfig_2.0.3