---
title: "GPMap tutorial: Haemoglobin concentration case study"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{GPMap tutorial: Haemoglobin concentration case study}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE,
  warning = FALSE,
  eval = TRUE,
  out.width = "100%",
  fig.align = "center"
)
```

This is a case study of GPMAP genetic colocalisations with the UK Biobank complex trait "Haemoglobin concentration", a physiological measure underpinned by well understood biological mechanisms of iron uptake, handling and storage and erythropoiesis, which potentially sits downstream of many disease states.

This vignette is a demonstrative example of Figure 4 in [the GPMap paper](https://doi.org/10.64898/2026.02.19.26346618).

We wish to demonstrate that the GPMap:

- Returns relevant colocalising signals between a physiological state and the molecular features/genes involved in regulation of that state
- Improves understanding of the diverse factors contributing to trait variability by enabling colocalising complex traits to be clustered based on the signals they share
- Can reveal new biology/avenues for therapeutic intervention

## Trait colocalisations

**Anchor trait:** Haemoglobin concentration (id: 4759)

```{r find-trait}
all_genes <- gpmapr::all_genes()
all_traits_df <- gpmapr::all_traits()
all_haemoglobin_traits <- gpmapr::search_gpmap("haemoglobin") |>
  dplyr::select(type_id, type, name, sample_size, num_coloc_groups)
anchor_trait_id <- 4759

trait_results <- gpmapr::trait(trait_id = anchor_trait_id)
haemoglobin_common <- trait_results$coloc_groups
haemoglobin_rare <- trait_results$rare_results
knitr::kable(all_haemoglobin_traits)
```

Exclude duplicate haemoglobin traits and filter on genome-wide significance:

```{r filter-traits}
# Identify other haemoglobin traits to exclude
trait_ids_hgb <- dplyr::filter(all_haemoglobin_traits, type_id != 4759)
traits_out <- trait_ids_hgb$type_id

# Exclude coloc groups where the anchor trait has no genome-wide significant hit in the region
colocs_out_common <- haemoglobin_common |>
  dplyr::filter(trait_id == anchor_trait_id & min_p > 5e-8) |>
  dplyr::pull(coloc_group_id)

colocs_out_rare <- haemoglobin_rare |>
  dplyr::filter(trait_id == anchor_trait_id & min_p > 5e-8) |>
  dplyr::pull(rare_result_group_id)

haemoglobin_common <- haemoglobin_common |>
  dplyr::filter(!(trait_id %in% traits_out), min_p < 5e-8,
                !(coloc_group_id %in% colocs_out_common))
haemoglobin_rare <- haemoglobin_rare |>
  dplyr::filter(!(trait_id %in% traits_out), min_p < 5e-8,
                !(rare_result_group_id %in% colocs_out_rare))

# Remove coloc/rare groups with only one study after filtering
solo_common <- names(table(haemoglobin_common$coloc_group_id))[table(haemoglobin_common$coloc_group_id) == 1]
haemoglobin_common <- haemoglobin_common |> dplyr::filter(!(coloc_group_id %in% solo_common))
solo_rare <- names(table(haemoglobin_rare$rare_result_group_id))[table(haemoglobin_rare$rare_result_group_id) == 1]
haemoglobin_rare <- haemoglobin_rare |> dplyr::filter(!(rare_result_group_id %in% solo_rare))
```

### Genome view

Colocalising signals across the genome, annotated by variant class and number of colocalising studies:

```{r genome-view, fig.width=15, fig.height=3.5}
n_per_locus_common <- haemoglobin_common |>
  dplyr::group_by(display_snp, chr, bp) |>
  dplyr::summarise(n_studies = dplyr::n(), .groups = "drop")

common_colocs_per_position <- haemoglobin_common |>
  dplyr::filter(trait_id == anchor_trait_id) |>
  dplyr::group_by(display_snp, chr, bp) |>
  dplyr::summarise(min_p = min(min_p, na.rm = TRUE), .groups = "drop") |>
  dplyr::inner_join(n_per_locus_common, by = c("display_snp", "chr", "bp")) |>
  dplyr::mutate(
    neg_log10_p = -log10(pmax(min_p, .Machine$double.xmin)),
    variant_type = "common"
  )

n_per_locus_rare <- haemoglobin_rare |>
  dplyr::group_by(display_snp, chr, bp) |>
  dplyr::summarise(n_studies = dplyr::n(), .groups = "drop")

rare_colocs_per_position <- haemoglobin_rare |>
  dplyr::filter(trait_id == anchor_trait_id) |>
  dplyr::group_by(display_snp, chr, bp) |>
  dplyr::summarise(min_p = min(min_p, na.rm = TRUE), .groups = "drop") |>
  dplyr::inner_join(n_per_locus_rare, by = c("display_snp", "chr", "bp")) |>
  dplyr::mutate(
    neg_log10_p = -log10(pmax(min_p, .Machine$double.xmin)),
    variant_type = "rare"
  )

colocs_per_position <- dplyr::bind_rows(common_colocs_per_position,
                                        rare_colocs_per_position) |>
  dplyr::mutate(chr_lab = as.character(chr))

chr_order_lbl <- c(as.character(1:22), "X", "Y")
chr_offsets <- all_genes |>
  dplyr::mutate(chr_lab = as.character(chr)) |>
  dplyr::group_by(chr_lab) |>
  dplyr::summarise(chr_len = max(stop, na.rm = TRUE), .groups = "drop") |>
  dplyr::mutate(
    ord = match(chr_lab, chr_order_lbl),
    ord = dplyr::if_else(is.na(ord), 999L, ord)
  ) |>
  dplyr::arrange(ord, chr_lab) |>
  dplyr::mutate(offset = cumsum(c(0, utils::head(chr_len, -1))))

colocs_per_position <- colocs_per_position |>
  dplyr::left_join(chr_offsets, by = "chr_lab") |>
  dplyr::mutate(x_mbp = (bp + offset) / 1e6)

axis_chr <- chr_offsets |>
  dplyr::filter(chr_lab %in% unique(colocs_per_position$chr_lab)) |>
  dplyr::mutate(x_mbp_mid = (offset + chr_len / 2) / 1e6) |>
  dplyr::arrange(ord)

colocs_per_position |>
  ggplot2::ggplot(ggplot2::aes(x = x_mbp, y = neg_log10_p,
                                colour = variant_type, size = n_studies)) +
  ggplot2::geom_point(alpha = 0.85) +
  ggplot2::scale_colour_manual(name = "Variant class:",
                               values = c(common = "steelblue", rare = "tomato")) +
  ggplot2::scale_size_continuous(name = "No. colocalising\nstudy rows\n(at locus)") +
  ggplot2::scale_x_continuous(
    breaks = axis_chr$x_mbp_mid,
    labels = axis_chr$chr_lab,
    expand = ggplot2::expansion(mult = c(0.01, 0.01))
  ) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(size = 11),
                 axis.title = ggplot2::element_text(size = 14),
                 panel.grid.major.x = ggplot2::element_blank(),
                 panel.grid.minor = ggplot2::element_blank(),
                 legend.position = "bottom") +
  ggplot2::labs(
    x = "Chromosome (cumulative position, Mb)",
    y = "-log10(P), anchor trait",
    title = "Colocalisation groups for anchor trait"
  )
```

This is a recreation of the data you can see on the GPMap website at https://gpmap.opengwas.io/trait.html?id=4759

### Complex trait colocalisations

Traits across which trait categories show the greatest proportion of GWS hits colocalising with the anchor trait:

```{r traitcategory-view, fig.width=20, fig.height=6}
pull_trait_ids_common <- haemoglobin_common |>
  dplyr::filter(data_type %in% c("Cell Trait", "Plasma Protein", "Phenotype")) |>
  dplyr::select(trait_id, trait_name, data_type, trait_category) |>
  dplyr::distinct()

# Use num_study_extractions from all_traits as a proxy for GWS hits per trait
n_studyhits_common <- all_traits_df |>
  dplyr::select(trait_id = id, n_studyhits = num_study_extractions) |>
  dplyr::filter(trait_id %in% pull_trait_ids_common$trait_id)

common_colocs_per_trait <- haemoglobin_common |>
  dplyr::group_by(trait_id) |>
  dplyr::summarise(n_colocs = dplyr::n(), .groups = "drop") |>
  dplyr::inner_join(pull_trait_ids_common, by = "trait_id") |>
  dplyr::inner_join(n_studyhits_common, by = "trait_id") |>
  dplyr::filter(trait_id != anchor_trait_id, n_colocs > 1, !is.na(trait_category),
                n_studyhits > 0) |>
  dplyr::mutate(
    prop_colocing_hits = (n_colocs / n_studyhits) * 100,
    colocs_cut = cut(n_colocs, breaks = c(1, 5, 10, 20, Inf))
  ) |>
  dplyr::arrange(trait_category, dplyr::desc(prop_colocing_hits)) |>
  dplyr::group_by(trait_category) |>
  dplyr::mutate(label = stringr::str_wrap(
    ifelse((prop_colocing_hits == max(prop_colocing_hits) & prop_colocing_hits > 10) |
             prop_colocing_hits > 25, trait_name, NA),
    width = 25
  )) |>
  dplyr::ungroup()

n_cats <- length(unique(common_colocs_per_trait$trait_category))
cols_cats <- setNames(
  c(RColorBrewer::brewer.pal(min(12, n_cats), "Paired"),
    if (n_cats > 12) RColorBrewer::brewer.pal(n_cats - 12, "Dark2") else character(0)),
  unique(common_colocs_per_trait$trait_category)
)
cols_cats <- c(cols_cats, Molecular = "bisque")

cut_groups <- c("(1,5]" = "2-5", "(5,10]" = "6-10", "(10,20]" = "11-20", "(20,Inf]" = ">20")

set.seed(40)
pos <- ggplot2::position_jitter(width = 0.4, height = 0)

common_colocs_per_trait |>
  ggplot2::ggplot(ggplot2::aes(x = trait_category, y = prop_colocing_hits,
                               colour = trait_category)) +
  ggplot2::geom_jitter(position = pos, ggplot2::aes(size = colocs_cut)) +
  ggrepel::geom_text_repel(ggplot2::aes(label = label),
                           position = pos, force = 20, max.overlaps = Inf,
                           point.padding = ggplot2::unit(0.5, "lines"),
                           box.padding = ggplot2::unit(0.5, "lines"),
                           size = 5, show.legend = FALSE) +
  ggplot2::scale_size_manual(name = "# coloc\ngroups", values = 3:9, labels = cut_groups) +
  ggplot2::scale_colour_manual(name = "", values = cols_cats) +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_blank(),
                 axis.title = ggplot2::element_text(size = 15),
                 panel.grid = ggplot2::element_blank(),
                 legend.position = "bottom",
                 legend.text = ggplot2::element_text(size = 12)) +
  ggplot2::labs(x = "", y = "Proportion of study hits\ncolocalising with anchor trait") +
  ggplot2::guides(colour = ggplot2::guide_legend(ncol = 6,
                                                 override.aes = list(size = 4)),
                  size = ggplot2::guide_legend(ncol = 1))
```


## Investigating a specific colocalisation group

### TMPRSS6: eQTL/mQTL/sQTL and physiological measures

TMPRSS6 is a negative regulator of the iron homeostasis hormone hepcidin (HAMP).

```{r TMPRSS6-setup}
# Filter to TMPRSS6 coloc groups
tmprss6_coloc_group_ids <- haemoglobin_common |>
  dplyr::filter(gene == "TMPRSS6") |>
  dplyr::pull(coloc_group_id) |>
  unique()

haemoglobin_tmprss6_all <- haemoglobin_common |>
  dplyr::filter(coloc_group_id %in% tmprss6_coloc_group_ids)

# Find the coloc group which has the most distinct data types
tmprss6_richest_coloc_id <- haemoglobin_tmprss6_all |>
  dplyr::group_by(coloc_group_id) |>
  dplyr::summarise(n_data_types = dplyr::n_distinct(data_type), .groups = "drop") |>
  dplyr::arrange(dplyr::desc(n_data_types), coloc_group_id) |>
  dplyr::slice(1) |>
  dplyr::pull(coloc_group_id)

haemoglobin_tmprss6 <- haemoglobin_tmprss6_all |>
  dplyr::filter(coloc_group_id == tmprss6_richest_coloc_id)

variant_id_tmprss6 <- haemoglobin_tmprss6$variant_id |> unique() |> head(1)
bp_tmprss6 <- haemoglobin_tmprss6$bp[1]

# Fetch summary statistics for the LD block containing the candidate SNP.
# summary_stats is a named list keyed by study_id; columns are uppercase (BP, SE, LBF_1, SNP, BETA, ...).
tmprss6_variant <- gpmapr::variant(variant_id_tmprss6, include_summary_stats = TRUE)

# Trait labels live on coloc_groups, not always on variant$study_extractions
se_tmprss6 <- tmprss6_variant$study_extractions |>
  dplyr::filter(id %in% haemoglobin_tmprss6$study_extraction_id)

meta_tmprss6 <- haemoglobin_tmprss6 |>
  dplyr::select(study_id, study_extraction_id, trait_id, trait_name, trait_category, data_type) |>
  dplyr::distinct()
```

The GPMap found the candidate SNP to be `r tmprss6_variant$variant$snp`, which VEP has annotated as an `r tmprss6_variant$variant$consequence` of `r tmprss6_variant$variant$symbol`.

### Region plot: scaled LBF values across all studies in the coloc group
```{r TMPRSS6-region, fig.width=8, fig.height=2}

tmprss6_gene <- all_genes |>
  dplyr::filter(gene == "TMPRSS6")

plot_region <- function(se_data, summary_stats, bp_min, bp_max, tag_snp_bp) {
  cols <- c(RColorBrewer::brewer.pal(12, "Paired"), RColorBrewer::brewer.pal(8, "Dark2"))

  regions <- dplyr::bind_rows(lapply(seq_len(nrow(se_data)), function(i) {
    ss <- summary_stats[[as.character(se_data$study_id[i])]]
    if (is.null(ss) || !"LBF_1" %in% names(ss)) return(NULL)
    ss <- ss |>
      dplyr::filter(SE != 1) |>  # Remove padded points for GoDMC studies
      dplyr::mutate(trait_id   = se_data$trait_id[i],
                    scaled_LBF = LBF_1 / max(abs(LBF_1), na.rm = TRUE))
    return(ss)
  }))

  if (nrow(regions) == 0) {
    stop("No summary statistics rows to plot; check study_id vs summary_stats names.")
  }

  p <- ggplot2::ggplot(regions,
                       ggplot2::aes(x = BP / 1e6, y = scaled_LBF,
                                    colour = as.factor(trait_id))) +
    ggplot2::geom_point(size = 1) +
    ggplot2::geom_vline(xintercept = tag_snp_bp / 1e6) +
    ggplot2::scale_x_continuous(limits = c(bp_min / 1e6, bp_max / 1e6)) +
    ggplot2::scale_colour_manual(name = "trait",
                                 values = cols[seq_len(length(unique(regions$trait_id)))]) +
    ggplot2::theme_minimal() +
    ggplot2::theme(legend.position = "none") +
    ggplot2::labs(x = "Position (Mb)", y = "Scaled LBF")
  return(p)
}

plot_region(se_tmprss6,
  tmprss6_variant$summary_stats,
  bp_min = tmprss6_gene$start - 50000,
  bp_max = tmprss6_gene$stop + 50000,
  tag_snp_bp = bp_tmprss6
)
```

We can also plot the effect estimates at the candidate SNP across the colocalising studies.

```{r TMPRSS6-betas, fig.width=6, fig.height=3}
# Effect estimates at candidate SNP across colocalising studies
plot_coloc_betas <- function(betas, title) {
  if (nrow(betas) == 0 || !"trait_category" %in% names(betas)) {
    stop("No beta rows or missing trait_category after join to coloc_groups metadata.")
  }
  betas <- dplyr::filter(betas, !is.na(trait_category), !is.na(trait_name))
  p <- betas |>
    ggplot2::ggplot(ggplot2::aes(y = trait_name, x = beta_alt)) +
    ggplot2::geom_vline(xintercept = 0, colour = "red", lty = 2) +
    ggplot2::geom_point(size = 2) +
    ggplot2::geom_errorbar(ggplot2::aes(xmin = beta_alt - 1.96 * .data[["SE"]],
                                        xmax = beta_alt + 1.96 * .data[["SE"]]),
                           width = 0.1) +
    ggplot2::facet_grid(trait_category ~ ., scale = "free", space = "free",
                        switch = "y") +
    ggplot2::theme_bw() +
    ggplot2::theme(panel.grid = ggplot2::element_blank(),
                   panel.border = ggplot2::element_blank(),
                   strip.background = ggplot2::element_rect(colour = NA),
                   plot.title = ggplot2::element_text(hjust = 0.5, size = 12)) +
    ggplot2::labs(x = "Study beta", y = "", title = title)
  return(p)
}

betas_tmprss6 <- dplyr::bind_rows(lapply(seq_len(nrow(se_tmprss6)), function(i) {
  ss <- tmprss6_variant$summary_stats[[as.character(se_tmprss6$study_id[i])]]
  if (is.null(ss)) return(NULL)
  ss <- ss |>
    dplyr::filter(grepl("^22:37100807", SNP)) |>
    dplyr::mutate(
      beta_alt = BETA * -1,
      study_id = se_tmprss6$study_id[i]
    )
  return(ss)
})) |>
  dplyr::left_join(meta_tmprss6, by = "study_id") |>
  dplyr::mutate(
    trait_category = dplyr::if_else(data_type != "Phenotype", "Molecular", trait_category)
  )

plot_coloc_betas(betas_tmprss6,
                 title = "Co-localising variant:\nrs9619658 (22:37100807 C/T)")
```

## Trait clustering

Of all the traits that colocalise with the anchor, how many coloc groups do each pair of traits share?

```{r trait-clustering, fig.width=6, fig.height=6, cache=TRUE}
# Remove duplicates crudely by name
traits_keep <- common_colocs_per_trait |>
  dplyr::mutate(name_cut = sub(" \\(.*\\)", "", trait_name)) |>
  dplyr::arrange(name_cut, dplyr::desc(n_studyhits)) |>
  dplyr::filter(!duplicated(name_cut)) |>
  dplyr::pull(trait_id)

# Compute pairwise shared coloc groups
trait_pairs <- expand.grid(trait1 = traits_keep, trait2 = traits_keep,
                           stringsAsFactors = FALSE)
traits_by_group <- split(haemoglobin_common$trait_id, haemoglobin_common$coloc_group_id)

trait_pairs$shared_colocs <- apply(trait_pairs, 1, function(x) {
  t1 <- x["trait1"]
  t2 <- x["trait2"]
  return(sum(vapply(traits_by_group, function(v) t1 %in% v & t2 %in% v, logical(1))))
})

# Convert to matrix, cluster, and reorder
mx_colocs <- reshape2::acast(trait_pairs, trait1 ~ trait2, value.var = "shared_colocs")
mx_clust  <- hclust(dist(mx_colocs))
mx_colocs <- mx_colocs[mx_clust$order, mx_clust$order]

mx_colocs_melt <- reshape2::melt(mx_colocs)
colnames(mx_colocs_melt) <- c("trait1", "trait2", "coloc_hits")
mx_colocs_melt <- mx_colocs_melt |>
  dplyr::mutate(log_coloc_hits = dplyr::if_else(coloc_hits == 0, NA_real_,
                                                log10(coloc_hits)))

ggplot2::ggplot(mx_colocs_melt,
                ggplot2::aes(
                  x = factor(trait1, levels = unique(mx_colocs_melt$trait1)),
                  y = factor(trait2, levels = unique(mx_colocs_melt$trait1)),
                  colour = log_coloc_hits
                )) +
  ggplot2::geom_point(shape = 20, size = 0.2) +
  ggplot2::scale_colour_gradient2(name = "log10(no. colocalising SNPs)",
                                  low = "#0072B2", high = "#E69F00",
                                  mid = "#F3CF80", midpoint = 1.3,
                                  na.value = "white") +
  ggplot2::theme_minimal() +
  ggplot2::theme(legend.position = "none",
                 axis.title = ggplot2::element_blank(),
                 panel.grid = ggplot2::element_blank(),
                 axis.text = ggplot2::element_blank())
```

We can also view the specific clusters that were found.
```{r trait-clustering-table}
# Define how many clusters you want (adjust 'k' as needed)
n_clusters <- 10
cluster_assignments <- stats::cutree(mx_clust, k = n_clusters)

# cutree names are character rownames; trait_id in common_colocs_per_trait is integer
trait_clusters <- dplyr::tibble(
  trait_id = as.integer(names(cluster_assignments)),
  cluster_id = as.factor(unname(cluster_assignments))
) |>
  dplyr::left_join(
    common_colocs_per_trait |>
      dplyr::select(trait_id, trait_name) |>
      dplyr::distinct(),
    by = "trait_id"
  ) |>
  dplyr::arrange(cluster_id)

# View the groups in a condensed format (truncate long trait lists for readability)
grouped_summary <- trait_clusters |>
  dplyr::group_by(cluster_id) |>
  dplyr::summarize(
    traits = stringr::str_trunc(
      paste(stats::na.omit(trait_name), collapse = ", "),
      width = 500
    ),
    .groups = "drop"
  )

knitr::kable(grouped_summary)
```