Feature summary

Create Omiprep object

library(omiprep)

# Import the example data bundled with the package.
# mustWork = TRUE makes a missing file fail here with a clear error; without it
# system.file() returns "" and read.csv("") would try to read from stdin.
extdata_file <- function(name) {
  system.file("extdata", name, package = "omiprep", mustWork = TRUE)
}

# check.names = FALSE keeps the column names exactly as written in the file
data <- read.csv(
  extdata_file("dummy_data.csv"),
  header = TRUE, row.names = 1, check.names = FALSE
) |>
  as.matrix()
samples <- read.csv(
  extdata_file("dummy_samples.csv"),
  header = TRUE, row.names = 1
)
features <- read.csv(
  extdata_file("dummy_features.csv"),
  header = TRUE, row.names = 1
)
# Feature IDs look numeric, so read.csv() would not keep them as text;
# store them as character identifiers.
features$feature_id <- as.character(features$feature_id)

# create object
mydata <- Omiprep(data = data, samples = samples, features = features)

Summary of Omiprep object

# Print an overview of the object: sample/feature counts, data layers,
# annotation columns and per-code exclusion counts.
summary(mydata)
#> Omiprep Object Summary
#> --------------------------
#> Samples      : 125
#> Features     : 253
#> Data Layers  : 1
#> Layer Names  : input
#> 
#> Sample Summary Layers : none
#> Feature Summary Layers: none
#> 
#> Sample Annotation (metadata):
#>   Columns: 10
#>   Names  : sample_id, parent_sample_id, client_identifier, sex, age, bmi, LC.MS.Polar, LC.MS.Neg, LC.MS.Pos.Early, LC.MS.Pos.Late
#> 
#> Feature Annotation (metadata):
#>   Columns: 14
#>   Names  : feature_id, pathway_sortorder, biochemical, super_pathway, sub_pathway, comp_id, platform, chemical_id, ri, mass, cas, pubchem, kegg, group_hmdb
#> 
#> Exclusion Codes Summary:
#> 
#>   Sample Exclusions:
#> Exclusion | Count
#> -----------------
#> user_excluded                     | 0
#> extreme_sample_missingness        | 0
#> user_defined_sample_missingness   | 0
#> user_defined_sample_totalpeakarea | 0
#> user_defined_sample_pca_outlier   | 0
#> 
#>   Feature Exclusions:
#> Exclusion | Count
#> -----------------
#> user_excluded                    | 0
#> extreme_feature_missingness      | 0
#> user_defined_feature_missingness | 0
#> user_defined_feature_skewness    | 0

Run standard quality control

# Run the QC pipeline with its default settings and keep the returned object;
# the log reports a new 'qc' data layer copied from the input data.
mydata <- quality_control(mydata)
#> ── Starting Omics QC Process ───────────────────────────────────────────────────
#> ℹ Validating input parameters
#> ✔ Validating input parameters [7ms]
#> 
#> ℹ Sample & Feature Summary Statistics for raw data
#> ℹ Number of informative PCs (Scree acceleration factor): 2
#> ℹ Sample & Feature Summary Statistics for raw data✔ Sample & Feature Summary Statistics for raw data [2.3s]
#> 
#> ℹ Copying input data to new 'qc' data layer
#> ✔ Copying input data to new 'qc' data layer [22ms]
#> 
#> ℹ Assessing for extreme sample missingness >=80% - excluding 0 sample(s)
#> ✔ Assessing for extreme sample missingness >=80% - excluding 0 sample(s) [20ms]
#> 
#> ℹ Assessing for extreme feature missingness >=80% - excluding 0 feature(s)
#> ✔ Assessing for extreme feature missingness >=80% - excluding 5 feature(s) [21m…
#> 
#> ℹ Assessing for sample missingness at specified level of >=20% - excluding 0 sa…
#> ✔ Assessing for sample missingness at specified level of >=20% - excluding 1 sa…
#> 
#> ℹ Assessing for feature missingness at specified level of >=20% - excluding 0 f…
#> ✔ Assessing for feature missingness at specified level of >=20% - excluding 46 …
#> 
#> ℹ Calculating total sum abundance outliers at +/- 5 Sdev - excluding 0 sample(s)
#> ✔ Calculating total sum abundance outliers at +/- 5 Sdev - excluding 0 sample(s…
#> 
#> ℹ Running sample data PCA outlier analysis at +/- 5 Sdev
#> ✔ Running sample data PCA outlier analysis at +/- 5 Sdev [21ms]
#> 
#> ℹ Sample PCA outlier analysis - re-identify feature independence and PC outlier…
#> ℹ Number of informative PCs (Scree acceleration factor): 2
#> ℹ Sample PCA outlier analysis - re-identify feature independence and PC outlier…! The stated max PCs [max_num_pcs=10] to use in PCA outlier assessment is greater than the number of available informative PCs [2]
#> ℹ Sample PCA outlier analysis - re-identify feature independence and PC outlier…✔ Sample PCA outlier analysis - re-identify feature independence and PC outlier…
#> 
#> ℹ Creating final QC dataset...
#> ℹ Number of informative PCs (Scree acceleration factor): 2
#> ℹ Creating final QC dataset...
#> ℹ Creating final QC dataset...── Step timings ──
#> ℹ Creating final QC dataset...
#> ℹ Creating final QC dataset...
#>                         step seconds   pct
#>                   validation    0.00   0.0
#>                summarise_raw    2.30  40.7
#>                   copy_layer    0.00   0.0
#>   extreme_sample_missingness    0.00   0.0
#>  extreme_feature_missingness    0.00   0.0
#>           sample_missingness    0.00   0.0
#>          total_sum_abundance    0.01   0.2
#>                summarise_pca    1.53  27.0
#>              summarise_final    1.59  28.1
#>                        total    5.66 100.1
#> ✔ Creating final QC dataset... [1.6s]
#> 
#> ℹ 'Omics QC Process Completed
#> ✔ 'Omics QC Process Completed [16ms]

Feature Summary

View feature summary from the QC pipeline

# quality_control() stores the per-feature statistics in the `feature_summary`
# slot, indexed here as [statistic, feature, layer]. Take the first five
# features of the "input" layer, flip them to one row per feature and round.
df <- mydata@feature_summary[, 1:5, "input"] |>
  as.data.frame() |>
  t() |>
  round(3) |>
  as.data.frame()

# Expose the feature IDs (currently row names) as a regular first column
df <- cbind(feature_id = rownames(df), df)

# Render as a full-width table with centred columns
df |>
  knitr::kable(digits = 3, row.names = FALSE, align = "c") |>
  kableExtra::kable_styling(full_width = TRUE)

feature_id	missingness	outlier_count	n	mean	sd	median	min	max	range	skew	kurtosis	se	missing	var	disp_index	coef_variance	W	log10_W	k	independent_features
48719	0.248	2	94	4.877	28.121	1	0.393	256.608	256.215	8.027	66.644	2.900	31	790.803	162.154	5.766	0.128	0.564	NA	NA
43532	0.568	1	54	1.296	1.096	1	0.302	7.172	6.870	3.189	13.306	0.149	71	1.201	0.927	0.846	0.675	0.977	NA	NA
46639	0.128	4	109	2.266	3.517	1	0.078	23.696	23.619	3.379	14.104	0.337	16	12.371	5.460	1.552	0.596	0.992	1	1
606	0.000	0	125	0.990	0.211	1	0.006	1.461	1.455	-1.367	5.791	0.019	0	0.045	0.045	0.214	0.903	0.307	2	1
62279	0.168	1	104	1.315	1.146	1	0.180	7.637	7.457	2.539	8.964	0.112	21	1.313	0.998	0.871	0.759	0.997	3	1

Manually run feature summary

Although the feature summary is run as part of the quality_control() pipeline, you can also run feature_summary() yourself, on any layer you wish.

# Summarise the features of the "input" layer and return the result as a
# data.frame (output = "data.frame").
# NOTE:
# outlier_udist = number of IQRs from the median at which a value is flagged.
# 1.0 here is illustrative; in practice we favour 5.0, which is the default value
# for the quality_control() function.
# tree_cut_height = presumably the height at which the feature clustering tree
# is cut to define independent features -- TODO confirm in ?feature_summary.
feature_sum1 <- feature_summary(omiprep         = mydata, 
                                source_layer    = "input", 
                                outlier_udist   = 1.0,
                                tree_cut_height = 0.5,
                                output          = "data.frame", 
                                cores           = 1)

Table of feature summary

# Show the first ten features as a compact table with centred columns
head(feature_sum1, n = 10) |>
  knitr::kable(digits = 3, row.names = FALSE, align = "c") |>
  kableExtra::kable_styling(full_width = FALSE)

feature_id	missingness	outlier_count	n	mean	sd	median	min	max	range	skew	kurtosis	se	missing	var	disp_index	coef_variance	W	log10_W	k	independent_features
48719	0.248	17	94	4.877	28.121	1	0.393	256.608	256.215	8.027	66.644	2.900	31	790.803	162.154	5.766	0.128	0.564	NA	NA
43532	0.568	10	54	1.296	1.096	1	0.302	7.172	6.870	3.189	13.306	0.149	71	1.201	0.927	0.846	0.675	0.977	NA	NA
46639	0.128	22	109	2.266	3.517	1	0.078	23.696	23.619	3.379	14.104	0.337	16	12.371	5.460	1.552	0.596	0.992	1	TRUE
606	0.000	27	125	0.990	0.211	1	0.006	1.461	1.455	-1.367	5.791	0.019	0	0.045	0.045	0.214	0.903	0.307	2	TRUE
62279	0.168	18	104	1.315	1.146	1	0.180	7.637	7.457	2.539	8.964	0.112	21	1.313	0.998	0.871	0.759	0.997	3	TRUE
2342	0.008	27	124	1.110	0.620	1	0.048	3.500	3.452	1.053	1.586	0.056	1	0.385	0.347	0.559	0.940	0.907	4	TRUE
53010	0.000	25	125	1.021	0.240	1	0.516	1.753	1.237	0.453	0.065	0.021	0	0.058	0.056	0.235	0.985	0.996	5	TRUE
52435	0.000	24	125	0.998	0.284	1	0.005	1.715	1.709	-0.357	1.234	0.025	0	0.081	0.081	0.285	0.979	0.440	6	TRUE
33384	0.608	11	49	7.325	19.419	1	0.248	112.333	112.085	3.911	16.323	2.774	76	377.115	51.483	2.651	0.401	0.861	NA	NA
52468	0.008	24	124	1.042	0.402	1	0.005	2.519	2.515	0.628	1.323	0.036	1	0.161	0.155	0.386	0.969	0.534	7	TRUE

Run feature summary on subset

Using the sample_ids and feature_ids arguments you can run the summary for a subset of the data. Note: all rows will be returned; however, summary data will only be returned for the specified ids.

## define a vector of sample IDs
## %in% (unlike ==) never returns NA, so samples with a missing `sex` are
## dropped instead of turning into NA sample IDs
sids <- mydata@samples[mydata@samples$sex %in% "female", "sample_id"]

## define a vector of feature IDs: 25 features drawn at random
## NOTE(review): sample() is unseeded, so the subset (and the table below)
## changes on every run; call set.seed() beforehand if a reproducible
## example is wanted.
fids <- mydata@features[, "feature_id"] |> sample(25)

# NOTE:
# outlier_udist = number of IQRs from the median at which a value is flagged.
# 1.0 here is illustrative; in practice we favour 5.0, which is the default value
# for the quality_control() function.
feature_sum_subset <- feature_summary(omiprep         = mydata,
                                      source_layer    = "input",
                                      outlier_udist   = 1.0,
                                      tree_cut_height = 0.5,
                                      sample_ids      = sids,
                                      feature_ids     = fids,
                                      output          = "data.frame",
                                      cores           = 1)

Table of feature summary for subset

# Keep only complete rows, then render them in a scrollable table.
# na.omit() removes the features outside the requested subset (their summary
# values are NA) and also any requested feature with an NA in any column.
na.omit(feature_sum_subset) |>
  knitr::kable(digits = 3, row.names = FALSE, align = "c") |>
  kableExtra::kable_styling(full_width = FALSE) |>
  kableExtra::scroll_box(width = "100%", height = "500px")

feature_id	missingness	outlier_count	n	mean	sd	median	min	max	range	skew	kurtosis	se	missing	var	disp_index	coef_variance	W	log10_W	k	independent_features
62279	0.154	10	55	1.296	1.002	0.964	0.207	4.580	4.374	1.321	1.358	0.135	10	1.004	0.775	0.773	0.865	0.985	10	TRUE
48433	0.031	13	63	1.241	0.944	0.967	0.238	5.642	5.405	2.692	8.242	0.119	2	0.891	0.718	0.761	0.694	0.965	15	TRUE
46701	0.000	15	65	1.208	0.650	1.096	0.197	3.359	3.163	0.895	0.784	0.081	0	0.423	0.350	0.538	0.947	0.974	2	TRUE
47011	0.077	11	60	1.610	1.483	1.099	0.211	8.781	8.570	2.495	7.994	0.191	5	2.199	1.366	0.921	0.741	0.994	16	TRUE
48698	0.000	10	65	1.163	0.605	1.077	0.207	3.391	3.184	0.940	1.233	0.075	0	0.366	0.315	0.520	0.943	0.983	8	TRUE
40499	0.062	16	61	1.220	0.715	1.004	0.345	3.933	3.588	1.702	2.966	0.092	4	0.511	0.419	0.586	0.828	0.978	11	TRUE
37196	0.015	11	64	1.548	2.283	0.868	0.251	14.813	14.562	4.056	18.334	0.285	1	5.213	3.367	1.475	0.489	0.933	9	TRUE
46904	0.062	12	61	1.206	0.721	1.046	0.003	4.089	4.086	1.415	2.893	0.092	4	0.520	0.431	0.598	0.900	0.679	14	FALSE
46695	0.000	10	65	1.322	0.960	1.030	0.399	5.644	5.245	2.445	6.843	0.119	0	0.921	0.696	0.726	0.721	0.952	13	TRUE
18368	0.031	17	63	1.144	0.796	0.980	0.151	4.206	4.055	1.572	2.666	0.100	2	0.634	0.555	0.696	0.853	0.977	12	TRUE
46618	0.015	12	64	1.141	0.560	1.059	0.008	2.534	2.526	0.476	-0.250	0.070	1	0.314	0.275	0.491	0.977	0.715	14	TRUE
46799	0.000	9	65	1.150	0.562	1.052	0.006	2.807	2.801	0.733	0.321	0.070	0	0.316	0.275	0.489	0.957	0.669	7	TRUE
38637	0.000	9	65	1.408	1.049	1.214	0.026	4.725	4.698	1.065	0.781	0.130	0	1.101	0.782	0.745	0.915	0.906	3	TRUE
18467	0.015	10	64	1.320	0.852	1.087	0.023	5.437	5.414	2.049	6.660	0.106	1	0.726	0.550	0.645	0.835	0.843	6	TRUE
21127	0.015	11	64	1.430	1.149	1.053	0.050	5.555	5.506	1.737	2.970	0.144	1	1.320	0.923	0.803	0.814	0.961	5	TRUE
46518	0.154	13	55	1.068	0.605	0.987	0.010	3.902	3.892	2.373	8.435	0.082	10	0.366	0.343	0.566	0.782	0.674	1	TRUE
22202	0.015	12	64	1.830	2.964	0.761	0.036	12.172	12.135	2.327	4.425	0.371	1	8.788	4.802	1.620	0.607	0.961	4	TRUE
42092	0.031	13	63	1.009	0.337	1.005	0.009	2.092	2.082	0.312	1.191	0.042	2	0.114	0.113	0.334	0.977	0.521	14	FALSE

Additional feature_summary() attributes

## List the attributes attached to the feature summary table: the usual
## data.frame attributes (names, row.names, class) plus `input_tree`, the
## hierarchical cluster dendrogram, and `input_outlier_udist` /
## `input_tree_cut_height`, which record the outlier_udist and
## tree_cut_height values passed to the function.
names( attributes(feature_sum1) )
#> [1] "names"                 "row.names"             "class"                
#> [4] "input_tree"            "input_outlier_udist"   "input_tree_cut_height"

hierarchical cluster dendrogram

In addition to the summary data, the hierarchical cluster dendrogram is appended to the returned data.frame as an attribute. It can be accessed with the attribute name [source_layer]_tree; in this case we summarised the input data, so the attribute name is input_tree.

suppressPackageStartupMessages(library(dendextend))

## number of independent features (NA flags are ignored)
indfeatcount <- sum(feature_sum1$independent_features, na.rm = TRUE)

# extract the tree, and the cut height recorded with it, from the attributes
tree       <- attr(feature_sum1, "input_tree")
cut_height <- attr(feature_sum1, "input_tree_cut_height")
dend       <- stats::as.dendrogram(tree)

# color the independent features blue and all other features grey.
# match() re-orders the rows to follow the leaf order of the dendrogram;
# `%in% TRUE` maps NA flags to grey instead of producing an NA colour.
metab_color       <- feature_sum1[, c("feature_id", "independent_features")]
metab_color       <- metab_color[match(labels(dend), metab_color$feature_id), ]
metab_color$color <- ifelse(metab_color$independent_features %in% TRUE,
                            "#084594", "grey80")

# format dendrogram for plotting
dend <- dend |>
  dendextend::set("labels_cex", 0.75) |>
  dendextend::set("labels_col", metab_color$color) |>
  dendextend::set("branches_lwd", 1) |>
  dendextend::set("branches_k_color", value = metab_color$color)

## plot the dendrogram
dend |>
  plot(main = paste0("Feature clustering dendrogram\n# of ind. features = ",
                     indfeatcount))
# mark the cut height that feature_summary() was run with, read back from the
# attribute so the line cannot drift from the value actually used
abline(h = cut_height, col = "#E41A1C", lwd = 1.5)

Feature clustering dendrogram: independent features are coloured blue, and the red horizontal line marks the tree cut height.

Run sample & feature summaries together

# Compute the sample and feature summaries in a single call; the feature
# summary is then available as sf_sum$feature_summary.
# NOTE:
# outlier_udist = number of IQRs from the median at which a value is flagged.
# 1.0 here is illustrative; in practice we favour 5.0, which is the default value
# for the quality_control() function.
sf_sum <- summarise(omiprep         = mydata, 
                    source_layer    = "input", 
                    outlier_udist   = 1.0,
                    tree_cut_height = 0.5,
                    sample_ids      = sids, ## optional: restrict the summary to a subset of samples and/or features
                    feature_ids     = fids,
                    output          = "data.frame", 
                    cores           = 1)
#> ℹ Number of informative PCs (Scree acceleration factor): 2

Table of feature summary from summarise() function

# Keep only the complete rows of the feature summary returned by summarise()
# and render them in a full-width, scrollable table.
na.omit(sf_sum$feature_summary) |>
  knitr::kable(digits = 3, row.names = FALSE, align = "c") |>
  kableExtra::kable_styling(full_width = TRUE) |>
  kableExtra::scroll_box(width = "100%", height = "500px")

feature_id	missingness	outlier_count	n	mean	sd	median	min	max	range	skew	kurtosis	se	missing	var	disp_index	coef_variance	W	log10_W	k	independent_features
62279	0.154	10	55	1.296	1.002	0.964	0.207	4.580	4.374	1.321	1.358	0.135	10	1.004	0.775	0.773	0.865	0.985	10	TRUE
48433	0.031	13	63	1.241	0.944	0.967	0.238	5.642	5.405	2.692	8.242	0.119	2	0.891	0.718	0.761	0.694	0.965	15	TRUE
46701	0.000	15	65	1.208	0.650	1.096	0.197	3.359	3.163	0.895	0.784	0.081	0	0.423	0.350	0.538	0.947	0.974	2	TRUE
47011	0.077	11	60	1.610	1.483	1.099	0.211	8.781	8.570	2.495	7.994	0.191	5	2.199	1.366	0.921	0.741	0.994	16	TRUE
48698	0.000	10	65	1.163	0.605	1.077	0.207	3.391	3.184	0.940	1.233	0.075	0	0.366	0.315	0.520	0.943	0.983	8	TRUE
40499	0.062	16	61	1.220	0.715	1.004	0.345	3.933	3.588	1.702	2.966	0.092	4	0.511	0.419	0.586	0.828	0.978	11	TRUE
37196	0.015	11	64	1.548	2.283	0.868	0.251	14.813	14.562	4.056	18.334	0.285	1	5.213	3.367	1.475	0.489	0.933	9	TRUE
46904	0.062	12	61	1.206	0.721	1.046	0.003	4.089	4.086	1.415	2.893	0.092	4	0.520	0.431	0.598	0.900	0.679	14	FALSE
46695	0.000	10	65	1.322	0.960	1.030	0.399	5.644	5.245	2.445	6.843	0.119	0	0.921	0.696	0.726	0.721	0.952	13	TRUE
18368	0.031	17	63	1.144	0.796	0.980	0.151	4.206	4.055	1.572	2.666	0.100	2	0.634	0.555	0.696	0.853	0.977	12	TRUE
46618	0.015	12	64	1.141	0.560	1.059	0.008	2.534	2.526	0.476	-0.250	0.070	1	0.314	0.275	0.491	0.977	0.715	14	TRUE
46799	0.000	9	65	1.150	0.562	1.052	0.006	2.807	2.801	0.733	0.321	0.070	0	0.316	0.275	0.489	0.957	0.669	7	TRUE
38637	0.000	9	65	1.408	1.049	1.214	0.026	4.725	4.698	1.065	0.781	0.130	0	1.101	0.782	0.745	0.915	0.906	3	TRUE
18467	0.015	10	64	1.320	0.852	1.087	0.023	5.437	5.414	2.049	6.660	0.106	1	0.726	0.550	0.645	0.835	0.843	6	TRUE
21127	0.015	11	64	1.430	1.149	1.053	0.050	5.555	5.506	1.737	2.970	0.144	1	1.320	0.923	0.803	0.814	0.961	5	TRUE
46518	0.154	13	55	1.068	0.605	0.987	0.010	3.902	3.892	2.373	8.435	0.082	10	0.366	0.343	0.566	0.782	0.674	1	TRUE
22202	0.015	12	64	1.830	2.964	0.761	0.036	12.172	12.135	2.327	4.425	0.371	1	8.788	4.802	1.620	0.607	0.961	4	TRUE
42092	0.031	13	63	1.009	0.337	1.005	0.009	2.092	2.082	0.312	1.191	0.042	2	0.114	0.113	0.334	0.977	0.521	14	FALSE