--- title: "Sample summary" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Sample summary} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ## Create Metaboprep object ```{r setup} library(metaboprep) # import data data <- read.csv(system.file("extdata", "dummy_data.csv", package = "metaboprep"), header=T, row.names = 1) |> as.matrix() samples <- read.csv(system.file("extdata", "dummy_samples.csv", package = "metaboprep"), header=T, row.names = 1) features <- read.csv(system.file("extdata", "dummy_features.csv", package = "metaboprep"), header=T, row.names = 1) # create object mydata <- Metaboprep(data = data, samples = samples, features = features) ``` ## Summary of Metaboprep object ```{r metaboprep_summary} summary(mydata) ``` ## Run sample summary ```{r sample_summary1} # note that for illustrative purposes we are using a log outlier unit distance of 1.0 here, in practice we tend to favor a value of 5.0. sample_sum1 <- sample_summary(metaboprep = mydata, source_layer = "input", outlier_udist = 1.0, output = "data.frame") ``` ## Table of sample summary ```{r sample_sum1_table, echo=FALSE} sample_sum1 |> head(n = 10) |> knitr::kable( digits = 3, row.names = FALSE, align = "c") |> kableExtra::kable_styling(full_width = TRUE) ``` ## Run sample summary on subset Using the `sample_ids` and `feature_ids` arguments you can run the summary for a subset of the data. Note: all rows will be return, however summary data will only be returned for the specified ids. ```{r sample_summary_subset} ## define a vector of sample IDs sids <- mydata@samples[mydata@samples$sex == "female", "sample_id"] ## define a vector of feature IDs fids <- mydata@features[, "feature_id"] |> sample(10) # run sample summary on subset sample_sum_subset <- sample_summary(metaboprep = mydata, source_layer = "input", outlier_udist = 1.0, sample_ids = sids, feature_ids = fids, output = "data.frame") ``` ## Table of sample summary on subset ```{r sample_sum_subset_table, echo=FALSE} sample_sum_subset |> na.omit() |> head(n = 10) |> knitr::kable( digits = 3, row.names = FALSE, align = "c") |> kableExtra::kable_styling(full_width = TRUE) ``` ## Run PCA analysis `pc_and_outliers()` performs principal component analysis. Missing data is imputed to the median and used to identify the number of informative or 'significant' PCs by (1) an acceleration analysis, and (2) a parallel analysis. Finally the number of sample outliers are determined at 3, 4, and 5 standard deviations from the mean on the top PCs as determined by the acceleration factor analysis. ```{r PC_analysis} pc_analysis <- pc_and_outliers(metaboprep = mydata, source_layer = "input", sample_ids = sids, ## It is also possible to run on a subset of samples and/or features feature_ids = NULL ) ``` ## Table of PCA analysis results Returned are the PC eigenvectors for the top 10 PCs, and outlier counts at 3, 4, and 5 standard deviations from the mean for the top two PCs ```{r PC_table, echo=FALSE} pc_analysis |> head(n = 10) |> knitr::kable( digits = 3, row.names = FALSE, align = "c") |> kableExtra::kable_styling(full_width = TRUE) ``` ## Additional attributes In addition, the variance explained vector is appended to the returned `data.frame` as and `attribute`. This can be accessed with the attribute name: `[source_layer]_varexp`, in this case we used the `input` data, therefore the attribute name is `input_varexp`. In a similar way, the results of the acceleration analysis (`input_num_pcs_scree`) and a parallel analysis (`input_num_pcs_parallel`) can also be extracted. ```{r ScreePlot, out.width="100%", fig.align='center', warning=FALSE, message=FALSE, fig.alt = "Variance explained"} library(ggplot2) # extract varexp from attributes varexp <- attr(pc_analysis, 'input_varexp') # subset to top 100 for nicer plotting if (length(varexp) > 100) varexp <- varexp[1:100] # get acceleration and parallel analysis results af <- attr(pc_analysis, 'input_num_pcs_scree') np <- attr(pc_analysis, 'input_num_pcs_parallel') if (af==np) np <- np+0.1 # make line visible if equal # as data.frame x_labs <- sub("(?i)pc","", names(varexp)) ve <- data.frame("pc" = factor(x_labs, levels=x_labs), "var_exp" = varexp) lines <- data.frame("Analysis" = c("Acceleration", "Parallel"), "pc" = c(af, np)) # plot ggplot(ve, aes(x = pc, y = var_exp)) + geom_line(color = "grey") + geom_point(shape = 21, fill = "#377EB8", size = 2) + geom_vline(data = lines, aes(xintercept = pc, color = Analysis), inherit.aes = FALSE) + scale_color_manual(values = c("Acceleration"="#E41A1C", "Parallel"="#4DAF4A")) + scale_x_discrete(labels = function(x) ifelse(seq_along(x) %% 10 == 0 | x==1, x, "")) + labs(x = "PC", y = "Variance explained") + theme_classic() + theme(legend.position = "top") ``` ## Run sample & feature summaries together ```{r sample_feature_summary} sf_sum <- summarise(metaboprep = mydata, source_layer = "input", outlier_udist = 1.0, tree_cut_height = 0.5, sample_ids = sids, ## It is also possible to run on a subset of samples and/or features feature_ids = NULL, output = "data.frame") ## two data frames are returned as a list object names(sf_sum) ``` ## Table of sample summary on subset Note that when the summarise() function is used the sample summary now includes PCA derived summary data. This is not the case when the sample_summary() function is run alone, as seen above. The reason for the difference is because the PCA data is dependent upon the feature_summary() analysis. Also, please note that when running on a subset, you are returned the full summary for all samples and features, but only the summary data for the specified subset will be populated, the rest will be `NA`. ```{r sample_sum_subset_table2, echo=FALSE} # note that for illustrative purposes we are using a log outlier unit distance of 1.0 here, in practice we tend to favor a value of 5.0. sf_sum$sample_summary |> head(n = 10) |> knitr::kable( digits = 3, row.names = FALSE, align = "c") |> kableExtra::kable_styling(full_width = TRUE) ```