Here is an example of mapping CHR:POS:REF:ALT columns to dbSNP build 156 RSIDs.
library(genepi.utils)
gwas <- data.table::fread(system.file("extdata", "example_gwas_sumstats.tsv", package="genepi.utils"))
head(gwas)
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
chr_col = "chromosome",
pos_col = "base_pair_location",
ea_col = "effect_allele",
nea_col = "other_allele",
build = "b37_dbsnp156")
head(gwas_with_rsids)
#> chromosome base_pair_location effect_allele other_allele RSID
#> <int> <int> <char> <char> <char>
#> 1: 10 100000625 G A rs7899632
#> 2: 10 100000645 C A rs61875309
#> 3: 10 100003242 G T rs12258651
#> 4: 10 100003785 C T rs1359508
#> 5: 10 100004360 A G rs1048754
Currently we have the latest dbSNP build 156 available for hg37 and hg38.
The mapping can take some time, depending on the number of cores on
your machine. To monitor progress we have to use the form below. We need
to wrap the function in progressr::with_progress({ })
.
In the dbSNP database there are multiple RSID assigned to the same
CHR:POS:REF:ALT combination. This function will return the first RSID
encountered and filter out the rest. If you want to get the alternative
RSIDs then use the alt_rsids=TRUE
flag. This will change
the structure of the output to a named list()
of two
data.table
elements: 1) “data”=gwas_data and 2)
“alt_rsids”=alt_rsid_data.
Although matching on alleles is desirable, especially for indels, we
can still match on just chromosome and position by either leaving
ea_col
and nea_col
arguments out, or setting
them to NULL
.
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
chr_col = "chromosome",
pos_col = "base_pair_location",
build = "b37_dbsnp156")
str(gwas_with_rsids)
Alleles are not always aligned to the reference and so we allow
allele flipping by default with the parameter flip="allow"
.
If you would like to flag which variants have been assigned an rsID
based on flipped alleles use flip="report"
; this will add a
logical column to the output called rsid_flip_match
. To
prevent allele flipping and only assign based on the reference allele
orientation use flip="no_flip"
.
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(gwas,
chr_col = "chromosome",
pos_col = "base_pair_location",
ea_col = "effect_allele",
nea_col = "other_allele",
flip = "report",
build = "b37_dbsnp156")
str(gwas_with_rsids)
Coding of indels as D/I is allowed, however please check the alternate rsID output as mutliple rsIDs (indels) may be present for D/I coding at any particular base position.
gwas_id_coding <- data.table::fread(system.file("extdata", "example3_gwas_sumstats.tsv", package="genepi.utils"))
gwas_with_rsids <- genepi.utils::chrpos_to_rsid(dt = gwas_id_coding,
chr_col = "chromosome",
pos_col = "base_pair_location",
ea_col = "effect_allele",
nea_col = "other_allele",
flip = "allow",
build = "b37_dbsnp156",
alt_rsids=TRUE)
str(gwas_with_rsids)
The choice of parameters will have a small impact computation speed.
Since the dbSNP data is stored as .fst
binary files and
only the desired rows / columns are ever read into memory, the more data
that is requested the slower the computation. That said, it is still
much faster / feasible than trying to read in the entire dbSNP database
(over 1 billion rsIDs).
library(microbenchmark)
library(ggplot2)
# some GWAS data
dt <- data.table::fread( gwas_sumstats_4.3million_rows )
# benchmarking
mbm <- microbenchmark("chrpos_to_rsid: no alleles, no alt rsids" = {
progressr::with_progress({
genepi.utils::chrpos_to_rsid(dt,
chr_col="CHR",
pos_col="POS",
ea_col=NULL,
nea_col=NULL,
build="b37_dbsnp156",
alt_rsids=FALSE,
flip="report")
})
},
"chrpos_to_rsid: no alleles, with alt rsids" = {
progressr::with_progress({
genepi.utils::chrpos_to_rsid(dt,
chr_col="CHR",
pos_col="POS",
ea_col=NULL,
nea_col=NULL,
build="b37_dbsnp156",
alt_rsids=TRUE,
flip="report")
})
},
"chrpos_to_rsid: with alleles, no alt rsids" = {
progressr::with_progress({
genepi.utils::chrpos_to_rsid(dt,
chr_col="CHR",
pos_col="POS",
ea_col="EFFECT_ALLELE",
nea_col="OTHER_ALLELE",
build="b37_dbsnp156",
alt_rsids=FALSE,
flip="report")
})
},
"chrpos_to_rsid: with alleles, with alt rsids" = {
progressr::with_progress({
genepi.utils::chrpos_to_rsid(dt,
chr_col="CHR",
pos_col="POS",
ea_col="EFFECT_ALLELE",
nea_col="OTHER_ALLELE",
build="b37_dbsnp156",
alt_rsids=TRUE,
flip="report")
})
},
times = 10L)
# plot
autoplot(mbm)