Last active
October 23, 2023 20:54
-
-
Save nickp60/8428d4a850db09df2e67ef1bf050f794 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Source this for a handy discrete color palette and various helper functions for microbiome analyses. | |
| # not complex enough for an actual package. For now... #featurecreep | |
| mycolors=c( | |
| "#511087", "#4ecdc4", "#ffd447", "#13c01f", "#b78900", "#e66d17", | |
| "#8e3400", "#729b79", "#bf0000", "#907ad6", "#3f6356", "#ffbfb7", | |
| "#92bccc", "#c0e8ab", "#890466", "#382438", "#dad2bc", "#0048ff", | |
| "#f5f749", "#146d25", "#c08552", "#fc60a8", "#89fc00", "#3df8ff", | |
| "#ef2917", "#101f75", "#bfcccc" | |
| ) | |
| stripplot_theme <- theme( | |
| axis.text.x = element_text(angle=45, hjust=1), | |
| panel.border = element_blank(), | |
| panel.background = element_blank(), | |
| strip.background = element_rect(size=1, fill = "grey90", color="black"), | |
| axis.line = element_line(size=1), | |
| axis.line.y.right = element_line(color="green", linewidth=5), | |
| axis.line.x.top = element_blank(), | |
| ) | |
| ############################### Differential Abundance Helpers | |
| ############# metagenomeSeq | |
| run_metagenomeSeq <- function(cmetadata, matrx, tax, formula_string, feature_of_interest, taxlevel="genus", | |
| filename_base="metagenomeSeq_da_analysis", pval_thresh=.05, prev_thresh=.05, abs_coef_thresh=1){ | |
| require(metagenomeSeq) | |
| init_samples_of_interest <-cmetadata %>% filter(rownames(.) %in% rownames(matrx)) %>% rownames() | |
| ASV_table <- t(matrx[rownames(matrx) %in% init_samples_of_interest, ]) | |
| #feature_data <- AnnotatedDataFrame(data.frame("ASV"=rownames(ASV_table), row.names = rownames(ASV_table))) | |
| test_obj<- newMRexperiment( | |
| counts = ASV_table[, init_samples_of_interest], | |
| phenoData = AnnotatedDataFrame(cmetadata[init_samples_of_interest, ]), | |
| featureData = AnnotatedDataFrame(as.data.frame(as.data.frame(tax) %>% mutate(ASV=rownames(.))))[rownames(ASV_table)]#feature_data[rownames(filtered_counts), ] | |
| ) | |
| print(paste("aggregating at", taxlevel)) | |
| test_obj_attax <- aggregateByTaxonomy(test_obj, lvl = taxlevel) | |
| print(str_glue("filtering input data to features over {prev_thresh*100}% prevalence; please perform any abundance filtering prior to execution")) | |
| test_obj_filtered_attax <- filterData(test_obj_attax, present=nrow(test_obj@phenoData@data) * prev_thresh) | |
| test_obj_filtered_attax <- filterData(test_obj_filtered_attax, depth = 1) | |
| samples_of_interest <- colnames(MRcounts(test_obj_filtered_attax))[colSums(MRcounts(test_obj_filtered_attax))!=0] | |
| print(str_glue("initial number of samples: {length(init_samples_of_interest)}; filtered: {length(samples_of_interest)}")) | |
| print(str_glue("initial number of features: {length(rownames(test_obj))}; filtered: {length(rownames(test_obj_filtered_attax))}")) | |
| print("Calculating normalization") | |
| p <- cumNormStat(test_obj_filtered_attax, pFlag = T) | |
| test_obj_filtered_norm <- cumNorm(test_obj_filtered_attax, p=p) | |
| mod_class <- model.matrix( | |
| formula(str_glue("~ {formula_string}")), | |
| data=pData(test_obj_filtered_norm) | |
| ) | |
| print("running fitZig; if this errors, consider checking for and removing NAs in metadata") | |
| regres_class <- fitZig(test_obj_filtered_norm, mod_class) | |
| coefs_class <- MRcoefs( | |
| regres_class, | |
| coef = grepl(feature_of_interest, colnames(mod_class)), | |
| group = 3, number = 50) | |
| res_table_class <- MRfulltable(regres_class, number = length(rownames(ASV_table)) ) | |
| print(str_glue("writing output table to {paste0(filename_base, '.tab')}")) | |
| write.table(res_table_class, file=paste0(filename_base, ".tab"), quote=F, sep="\t", col.names = NA) | |
| if (is.numeric(cmetadata[, feature_of_interest])){ | |
| pmetagenomeseq <- ggplot( | |
| coefs_class %>% filter(adjPvalues < pval_thresh) %>% rownames_to_column("taxa") %>% | |
| filter(abs(get(feature_of_interest)) > abs_coef_thresh), | |
| aes(y=reorder(taxa, adjPvalues), x=get(feature_of_interest), color=get(feature_of_interest))) + | |
| geom_point(size=3) + | |
| labs( | |
| title="metagenomeSeq Enrichment Analysis", | |
| x=feature_of_interest, | |
| color=feature_of_interest, | |
| y="Taxa") + | |
| theme(legend.position = "bottom") + scale_color_gradient2(low = "brown", mid = "white", high = "forestgreen", midpoint = 0) | |
| } else { | |
| pmetagenomeseq <- ggplot( | |
| coefs_class %>% | |
| rownames_to_column("taxa") %>% | |
| pivot_longer(cols=colnames(.)[grepl(feature_of_interest, colnames(.))], | |
| names_to = "feature", values_to = "val") %>% | |
| filter(adjPvalues < pval_thresh) %>% | |
| filter(abs(val) > abs_coef_thresh), | |
| aes(y=reorder(taxa, adjPvalues), x=val, color=feature)) + | |
| geom_point(size=3) + | |
| scale_color_discrete(guide="none") + | |
| facet_wrap(~feature) + | |
| labs( | |
| title="metagenomeSeq Enrichment Analysis", | |
| x="Coefficient ", | |
| color=feature_of_interest, | |
| y="Taxa") + | |
| theme(legend.position = "bottom") | |
| } | |
| print(str_glue("writing output figure to {paste0(filename_base, '.png')}")) | |
| ggsave(pmetagenomeseq, filename = paste0(filename_base, ".png"), width=7, height=10) | |
| } | |
| ########### Corncob | |
| run_corncob <- function(physeq, formula_string, feature_of_interest, prev_thresh=.05, taxlevel="genus", filename_base="corncob_da_analysis"){ | |
| require(corncob) | |
| print(str_glue("aggregating taxa at {taxlevel} level")) | |
| dat <- tax_glom(physeq, taxlevel) | |
| dat = filter_taxa(dat, function(x) sum(x > 0) > (prev_thresh*length(x)), TRUE) | |
| print("constructing model null formula:") | |
| null_formula_string = gsub("\\+\\+", "\\+", gsub("\\+$", "", gsub("^\\+", "",gsub(feature_of_interest , "", formula_string)))) | |
| print(null_formula_string) | |
| print("running corncob") | |
| da_analysis <- differentialTest( | |
| #try_only = "asv_1", | |
| formula = formula(str_glue("~{formula_string}")), | |
| formula_null = formula(str_glue("~{null_formula_string}")), | |
| phi.formula = formula(str_glue("~{formula_string}")), | |
| phi.formula_null = formula(str_glue("~{null_formula_string}")), | |
| test = "Wald", boot = FALSE, | |
| data = dat, | |
| fdr_cutoff = 0.05, verbose=TRUE) | |
| print("saving corncob object") | |
| save(da_analysis, file = paste0(filename_base, ".Rdata")) | |
| png(filename = paste0(filename_base, ".png"), width = 9, height = 5, units = "in", res = 300) | |
| plot(da_analysis) | |
| dev.off() | |
| } | |
| ##################### Ancom | |
| run_Ancom <- function(physeq, formula_string,rand_formula_string=NULL, feature_of_interest, taxlevel="genus", | |
| filename_base="ancom", pval_thresh=.05, prev_thresh=.05, abs_coef_thresh=1, group_var="intensity_update"){ | |
| print(str_glue("Loading ANCOM code")) | |
| # changed struture | |
| # source("https://raw.githubusercontent.com/FrederickHuangLin/ANCOM/master/scripts/ancom_v2.1.R") | |
| source("https://raw.githubusercontent.com/FrederickHuangLin/ANCOM/8abbfb67ed1cb9ae366ab6349e0ad8f017ca6512/scripts/ancom_v2.1.R") | |
| print(str_glue("Data preprocessing")) | |
| print(paste("aggregating at", taxlevel)) | |
| dat <- tax_glom(physeq, taxlevel) | |
| prepro = feature_table_pre_process( | |
| feature_table = otu_table(dat) %>% data.frame() %>% t(), | |
| meta_data=sample_data(dat) %>%data.frame() %>% rownames_to_column("sampleid"), | |
| sample_var="sampleid", group_var=group_var, | |
| out_cut= 0.05, zero_cut= 0.90, lib_cut = 0, neg_lb=TRUE) | |
| feature_table = prepro$feature_table # Preprocessed feature table | |
| meta_data = prepro$meta_data # Preprocessed metadata | |
| struc_zero = prepro$structure_zeros # Structural zero info | |
| # Step 2: ANCOM | |
| print(str_glue("Running ANCOM")) | |
| t_start = Sys.time() | |
| res = ANCOM( | |
| feature_table = prepro$feature_table, | |
| meta_data = prepro$meta_data, | |
| struc_zero=prepro$structure_zeros ,# Structural zero info, | |
| main_var = feature_of_interest, | |
| p_adj_method="BH", | |
| alpha = 0.05, | |
| adj_formula=formula_string, | |
| rand_formula=rand_formula_string, | |
| control = lmeControl(maxIter = 100, msMaxIter = 100, opt = "optim")) | |
| t_end = Sys.time() | |
| t_run = t_end - t_start | |
| print(str_glue("Done running ANCOM: took {t_run} ")) | |
| write_csv(res$out, str_glue("{filename_base}.csv")) | |
| # Step 3: Volcano Plot | |
| # Number of taxa except structural zeros | |
| n_taxa = ifelse(is.null(struc_zero), nrow(feature_table), sum(apply(struc_zero, 1, sum) == 0)) | |
| # Cutoff values for declaring differentially abundant taxa | |
| cut_off = c(0.9 * (n_taxa -1), 0.8 * (n_taxa -1), 0.7 * (n_taxa -1), 0.6 * (n_taxa -1)) | |
| names(cut_off) = c("detected_0.9", "detected_0.8", "detected_0.7", "detected_0.6") | |
| # Annotation data | |
| dat_ann = data.frame(x = min(res$fig$data$x), y = cut_off["detected_0.9"], label = "W[0.9]") | |
| fig = res$fig + | |
| geom_hline(yintercept = cut_off["detected_0.7"], linetype = "dashed") + | |
| geom_text(data = dat_ann, aes(x = x, y = y, label = label), | |
| size = 4, vjust = -0.5, hjust = 0, color = "orange", parse = TRUE) | |
| fig | |
| } | |
| run_ANCOMBC <- function(physeq, formula_string, feature_of_interest, group_var, prev_thresh=.05, taxlevel="genus", filename_base="ancombc_da_analysis"){ | |
| require(ANCOMBC) | |
| if(!any(rownames(tax) %in% colnames(matrx))){ | |
| stop(" rownames of the tax object must be present in the colnames of the matrx") | |
| } | |
| print(str_glue("aggregating taxa at {taxlevel} level")) | |
| dat <- tax_glom(physeq, taxlevel) | |
| print(str_glue("prevalence filtering")) | |
| dat = filter_taxa(dat, function(x) sum(x > 0) > (prev_thresh*length(x)), TRUE) | |
| print("running ANCOM-BC") | |
| ancombc_obj = ancombc( | |
| phyloseq = dat, | |
| formula = formula_string, | |
| p_adj_method = "holm", | |
| zero_cut = 1-prev_thresh, | |
| lib_cut = 0, | |
| group = group_var, | |
| struc_zero = TRUE, | |
| neg_lb = TRUE, | |
| tol = 1e-5, | |
| max_iter = 100, | |
| conserve = TRUE, | |
| alpha = 0.05, | |
| global = TRUE) | |
| print("saving ANCOMBC object") | |
| save(ancombc_obj, file = paste0(filename_base, ".Rdata")) | |
| write.csv( | |
| rbind( | |
| ancombc_obj$res$beta %>% mutate(tax=rownames(.), metric="coef"), | |
| ancombc_obj$res$se %>% mutate(tax=rownames(.), metric="stderr"), | |
| ancombc_obj$res$q_val %>% mutate(tax=rownames(.), metric="qval") | |
| ) %>% left_join(., tax_table(dat) %>% as.data.frame() %>% select(genus) %>% rownames_to_column("tax") %>% distinct()) %>% | |
| select(-tax) %>% | |
| rename("tax"="genus") %>% | |
| pivot_longer(-c(tax, metric), names_to = "comparison", values_to="value") , | |
| file=paste0(filename_base, ".csv"), | |
| row.names = FALSE) | |
| } | |
| ##################3 Agggregation | |
| aggregate_DA_runs <- function(maaslin_path, metagenomeSeq_path, ancom_path, ancombc_path, corncob_rdata, out_prefix, feature_of_interest="regimen_coll_upd"){ | |
| maaslin_tax <- read.csv(maaslin_path, sep="\t") %>% | |
| filter(metadata=="regimen_coll_upd") %>% filter(qval < .05) | |
| mg_tax <- read.csv(metagenomeSeq_path, sep="\t") %>% filter(adjPvalues < .05) | |
| ancom_tax <- read.csv(ancom_path, sep=",") | |
| ancombc_tax <- read.csv(ancombc_path, sep=",") %>% pivot_wider(names_from=metric, values_from = value) %>% | |
| filter(grepl(feature_of_interest, comparison)) %>% filter(qval < .05) | |
| load(corncob_rdata, verbose = TRUE) # loads da_analysis | |
| # from https://github.com/bryandmartin/corncob/issues/93 | |
| extractCoefficients <- function(diftest_res, taxa){ | |
| #add names to all_models list | |
| names(diftest_res$all_models) <- taxa_names(diftest_res$data) | |
| # default is all taxa | |
| if (missing(taxa)) { | |
| taxa <- taxa_names(diftest_res$data) | |
| } else { | |
| taxa <- taxa | |
| } | |
| purrr::map( | |
| diftest_res$all_models[taxa], | |
| ~ stats::coef(.) | |
| ) | |
| } | |
| # note that the da_analysis$significant_taxa include significance both in terms of dispersion and a | |
| sig_taxa_coefs <- purrr::map_dfr( | |
| .x = da_analysis$significant_taxa, | |
| function(x){ | |
| as.data.frame(extractCoefficients(da_analysis, taxa=x)[[1]] ) %>% | |
| rownames_to_column("comparison") %>% | |
| mutate(asv_key=x, | |
| genus= corncob::otu_to_taxonomy(OTU = x, data = da_analysis$data, level = "genus"), | |
| tool="corncob") %>% | |
| rename(coef=Estimate, stderr=`Std. Error`, adjPvalues=`Pr(>|t|)`) %>% | |
| select(-`t value`) %>% | |
| filter(grepl(paste0("mu.",feature_of_interest), comparison)) %>% | |
| mutate(comparison = gsub(paste0("mu.",feature_of_interest), "", comparison)) %>% | |
| filter(adjPvalues < .05) %>% | |
| select(-asv_key) | |
| } | |
| ) | |
| venn_data <- list( | |
| "MaAslin2"=unique(maaslin_tax$feature), | |
| "metagenomeSeq"=unique(mg_tax$X), | |
| "ANCOM-II"=unique(ancom_tax$taxa_id), | |
| "ANCOM-BC"=unique(ancombc_tax$tax), | |
| "corncob"= unique(sig_taxa_coefs$genus) | |
| ) | |
| venn_data | |
| ( | |
| p_venn <- ggvenn::ggvenn(data=venn_data, show_percentage = FALSE, #show_elements = TRUE,text_size = 1, | |
| fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF", "#00FF00FF"), | |
| stroke_size = 0.5, set_name_size = 4 | |
| ) | |
| ) | |
| dir.create("results", showWarnings = FALSE) | |
| ggsave(filename = paste0(out_prefix, "_tools_venn.pdf"), width=4, height=4, plot = p_venn) | |
| p_upset <- ggplot(do.call(rbind,lapply(venn_data,data.frame)) %>% | |
| mutate(tool=gsub("(.*)\\..*", "\\1", rownames(.))) %>% | |
| remove_rownames() %>% | |
| distinct() %>% | |
| rename(genus=1) %>% group_by(genus) %>% | |
| summarize(tools = list(tool)), aes(x=tools)) + geom_bar() + | |
| scale_x_upset(reverse = FALSE) | |
| ggsave(filename = paste0(out_prefix, "_tools_upset.pdf"), width=4, height=4, plot = p_venn) | |
| results_table <- rbind( | |
| maaslin_tax %>% select(feature, coef, value, stderr, qval) %>% | |
| rename(genus=feature, adjPvalues=qval, comparison=value) %>% mutate(tool="MaAsLin2") %>% | |
| mutate(comparison=gsub("[^[:alnum:] ]", "_",comparison)), | |
| mg_tax %>% select(X, any_of(grep(feature_of_interest, colnames(.), value = TRUE)), adjPvalues) %>% | |
| mutate(stderr=NA, tool="metagenomeSeq") %>% | |
| pivot_longer(cols = grep(feature_of_interest, colnames(.) , value=TRUE), names_to = "comparison", values_to="coef") %>% | |
| rename(genus=X) %>% | |
| mutate(comparison=gsub("[^[:alnum:] ]", "_",gsub(feature_of_interest, "", comparison))), | |
| sig_taxa_coefs %>% | |
| mutate(comparison=gsub("[^[:alnum:] ]", "_",comparison)), | |
| ancom_tax %>% rename(genus=taxa_id) %>% | |
| select(genus) %>% | |
| mutate( coef =NA, stderr=NA, comparison=NA, adjPvalues=NA, tool="ANCOM-II"), | |
| ancombc_tax %>% select(tax, coef, comparison, stderr, qval) %>% | |
| rename(genus=tax, adjPvalues=qval) %>% mutate(tool="ANCOM-BC") %>% | |
| mutate(comparison=gsub("[^[:alnum:] ]", "_", gsub(feature_of_interest, "", comparison))) | |
| ) %>% | |
| group_by(genus, comparison) %>% | |
| mutate(n_tools=length(unique(tool))) %>% | |
| ungroup() %>% | |
| arrange(desc(n_tools), coef) | |
| knitr::kable(results_table) | |
| write_excel_csv(results_table, file = paste0(out_prefix, "_enrichment_table.csv")) | |
| p_coef <- ggplot(results_table %>% group_by(tool, comparison, sign(coef)) %>% | |
| mutate(relcoef = sign(coef)*(abs(coef)/max(abs(coef)))) %>% | |
| group_by(genus, comparison, tool, sign(coef), adjPvalues, n_tools) %>% | |
| summarize(maxrelcoef = sign(coef) * max(abs(relcoef))) %>% | |
| distinct() %>% ungroup() %>% group_by(tool, genus) %>% | |
| filter(abs(maxrelcoef) == max(abs(maxrelcoef))) , | |
| aes(x=tool, y=reorder(genus, n_tools), fill=maxrelcoef, alpha=adjPvalues)) + | |
| geom_point(aes(size=adjPvalues), | |
| stroke=1, color="black", shape=21) + | |
| scale_y_discrete(expand = c(.05, .05)) + | |
| scale_color_manual(values = c("white", "black")) + | |
| scale_fill_gradient2(low = "brown", mid = "white", high = "forestgreen", midpoint = 0)+ | |
| scale_alpha(range=c(.9, .1)) + | |
| scale_size(range=c(5, .1)) + | |
| theme(panel.grid.major.y = element_line(color="grey70", size=.1), | |
| axis.text.y = element_text(face="italic"), | |
| axis.text.x = element_text(angle=45, hjust=1)) + | |
| labs(title="", y="Genus", x="Tool", fill = "Realtive Coefficent\n(per tool)", alpha="Corrected P-value", size="Corrected P-value") | |
| if (!length(unique(results_table$comparison)) == 1){ | |
| ggsave(p_coef + facet_wrap(~comparison, scales = "free_y"), filename = paste0(out_prefix, "coefficients_by_tool.pdf"), width=10, height=7) | |
| } else{ | |
| ggsave(p_coef, filename = paste0(out_prefix, "coefficients_by_tool.pdf"), width=6, height=5) | |
| } | |
| return(list(table=results_table, venn=venn_data)) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment