Skip to content

Instantly share code, notes, and snippets.

@nickp60
Last active October 23, 2023 20:54
Show Gist options
  • Select an option

  • Save nickp60/8428d4a850db09df2e67ef1bf050f794 to your computer and use it in GitHub Desktop.

Select an option

Save nickp60/8428d4a850db09df2e67ef1bf050f794 to your computer and use it in GitHub Desktop.
# Source this for a handy discrete color palette and various helper functions for microbiome analyses.
# not complex enough for an actual package. For now... #featurecreep
mycolors=c(
"#511087", "#4ecdc4", "#ffd447", "#13c01f", "#b78900", "#e66d17",
"#8e3400", "#729b79", "#bf0000", "#907ad6", "#3f6356", "#ffbfb7",
"#92bccc", "#c0e8ab", "#890466", "#382438", "#dad2bc", "#0048ff",
"#f5f749", "#146d25", "#c08552", "#fc60a8", "#89fc00", "#3df8ff",
"#ef2917", "#101f75", "#bfcccc"
)
stripplot_theme <- theme(
axis.text.x = element_text(angle=45, hjust=1),
panel.border = element_blank(),
panel.background = element_blank(),
strip.background = element_rect(size=1, fill = "grey90", color="black"),
axis.line = element_line(size=1),
axis.line.y.right = element_line(color="green", linewidth=5),
axis.line.x.top = element_blank(),
)
############################### Differential Abundance Helpers
############# metagenomeSeq
run_metagenomeSeq <- function(cmetadata, matrx, tax, formula_string, feature_of_interest, taxlevel="genus",
filename_base="metagenomeSeq_da_analysis", pval_thresh=.05, prev_thresh=.05, abs_coef_thresh=1){
require(metagenomeSeq)
init_samples_of_interest <-cmetadata %>% filter(rownames(.) %in% rownames(matrx)) %>% rownames()
ASV_table <- t(matrx[rownames(matrx) %in% init_samples_of_interest, ])
#feature_data <- AnnotatedDataFrame(data.frame("ASV"=rownames(ASV_table), row.names = rownames(ASV_table)))
test_obj<- newMRexperiment(
counts = ASV_table[, init_samples_of_interest],
phenoData = AnnotatedDataFrame(cmetadata[init_samples_of_interest, ]),
featureData = AnnotatedDataFrame(as.data.frame(as.data.frame(tax) %>% mutate(ASV=rownames(.))))[rownames(ASV_table)]#feature_data[rownames(filtered_counts), ]
)
print(paste("aggregating at", taxlevel))
test_obj_attax <- aggregateByTaxonomy(test_obj, lvl = taxlevel)
print(str_glue("filtering input data to features over {prev_thresh*100}% prevalence; please perform any abundance filtering prior to execution"))
test_obj_filtered_attax <- filterData(test_obj_attax, present=nrow(test_obj@phenoData@data) * prev_thresh)
test_obj_filtered_attax <- filterData(test_obj_filtered_attax, depth = 1)
samples_of_interest <- colnames(MRcounts(test_obj_filtered_attax))[colSums(MRcounts(test_obj_filtered_attax))!=0]
print(str_glue("initial number of samples: {length(init_samples_of_interest)}; filtered: {length(samples_of_interest)}"))
print(str_glue("initial number of features: {length(rownames(test_obj))}; filtered: {length(rownames(test_obj_filtered_attax))}"))
print("Calculating normalization")
p <- cumNormStat(test_obj_filtered_attax, pFlag = T)
test_obj_filtered_norm <- cumNorm(test_obj_filtered_attax, p=p)
mod_class <- model.matrix(
formula(str_glue("~ {formula_string}")),
data=pData(test_obj_filtered_norm)
)
print("running fitZig; if this errors, consider checking for and removing NAs in metadata")
regres_class <- fitZig(test_obj_filtered_norm, mod_class)
coefs_class <- MRcoefs(
regres_class,
coef = grepl(feature_of_interest, colnames(mod_class)),
group = 3, number = 50)
res_table_class <- MRfulltable(regres_class, number = length(rownames(ASV_table)) )
print(str_glue("writing output table to {paste0(filename_base, '.tab')}"))
write.table(res_table_class, file=paste0(filename_base, ".tab"), quote=F, sep="\t", col.names = NA)
if (is.numeric(cmetadata[, feature_of_interest])){
pmetagenomeseq <- ggplot(
coefs_class %>% filter(adjPvalues < pval_thresh) %>% rownames_to_column("taxa") %>%
filter(abs(get(feature_of_interest)) > abs_coef_thresh),
aes(y=reorder(taxa, adjPvalues), x=get(feature_of_interest), color=get(feature_of_interest))) +
geom_point(size=3) +
labs(
title="metagenomeSeq Enrichment Analysis",
x=feature_of_interest,
color=feature_of_interest,
y="Taxa") +
theme(legend.position = "bottom") + scale_color_gradient2(low = "brown", mid = "white", high = "forestgreen", midpoint = 0)
} else {
pmetagenomeseq <- ggplot(
coefs_class %>%
rownames_to_column("taxa") %>%
pivot_longer(cols=colnames(.)[grepl(feature_of_interest, colnames(.))],
names_to = "feature", values_to = "val") %>%
filter(adjPvalues < pval_thresh) %>%
filter(abs(val) > abs_coef_thresh),
aes(y=reorder(taxa, adjPvalues), x=val, color=feature)) +
geom_point(size=3) +
scale_color_discrete(guide="none") +
facet_wrap(~feature) +
labs(
title="metagenomeSeq Enrichment Analysis",
x="Coefficient ",
color=feature_of_interest,
y="Taxa") +
theme(legend.position = "bottom")
}
print(str_glue("writing output figure to {paste0(filename_base, '.png')}"))
ggsave(pmetagenomeseq, filename = paste0(filename_base, ".png"), width=7, height=10)
}
########### Corncob
run_corncob <- function(physeq, formula_string, feature_of_interest, prev_thresh=.05, taxlevel="genus", filename_base="corncob_da_analysis"){
require(corncob)
print(str_glue("aggregating taxa at {taxlevel} level"))
dat <- tax_glom(physeq, taxlevel)
dat = filter_taxa(dat, function(x) sum(x > 0) > (prev_thresh*length(x)), TRUE)
print("constructing model null formula:")
null_formula_string = gsub("\\+\\+", "\\+", gsub("\\+$", "", gsub("^\\+", "",gsub(feature_of_interest , "", formula_string))))
print(null_formula_string)
print("running corncob")
da_analysis <- differentialTest(
#try_only = "asv_1",
formula = formula(str_glue("~{formula_string}")),
formula_null = formula(str_glue("~{null_formula_string}")),
phi.formula = formula(str_glue("~{formula_string}")),
phi.formula_null = formula(str_glue("~{null_formula_string}")),
test = "Wald", boot = FALSE,
data = dat,
fdr_cutoff = 0.05, verbose=TRUE)
print("saving corncob object")
save(da_analysis, file = paste0(filename_base, ".Rdata"))
png(filename = paste0(filename_base, ".png"), width = 9, height = 5, units = "in", res = 300)
plot(da_analysis)
dev.off()
}
##################### Ancom
run_Ancom <- function(physeq, formula_string,rand_formula_string=NULL, feature_of_interest, taxlevel="genus",
filename_base="ancom", pval_thresh=.05, prev_thresh=.05, abs_coef_thresh=1, group_var="intensity_update"){
print(str_glue("Loading ANCOM code"))
# changed struture
# source("https://raw.githubusercontent.com/FrederickHuangLin/ANCOM/master/scripts/ancom_v2.1.R")
source("https://raw.githubusercontent.com/FrederickHuangLin/ANCOM/8abbfb67ed1cb9ae366ab6349e0ad8f017ca6512/scripts/ancom_v2.1.R")
print(str_glue("Data preprocessing"))
print(paste("aggregating at", taxlevel))
dat <- tax_glom(physeq, taxlevel)
prepro = feature_table_pre_process(
feature_table = otu_table(dat) %>% data.frame() %>% t(),
meta_data=sample_data(dat) %>%data.frame() %>% rownames_to_column("sampleid"),
sample_var="sampleid", group_var=group_var,
out_cut= 0.05, zero_cut= 0.90, lib_cut = 0, neg_lb=TRUE)
feature_table = prepro$feature_table # Preprocessed feature table
meta_data = prepro$meta_data # Preprocessed metadata
struc_zero = prepro$structure_zeros # Structural zero info
# Step 2: ANCOM
print(str_glue("Running ANCOM"))
t_start = Sys.time()
res = ANCOM(
feature_table = prepro$feature_table,
meta_data = prepro$meta_data,
struc_zero=prepro$structure_zeros ,# Structural zero info,
main_var = feature_of_interest,
p_adj_method="BH",
alpha = 0.05,
adj_formula=formula_string,
rand_formula=rand_formula_string,
control = lmeControl(maxIter = 100, msMaxIter = 100, opt = "optim"))
t_end = Sys.time()
t_run = t_end - t_start
print(str_glue("Done running ANCOM: took {t_run} "))
write_csv(res$out, str_glue("{filename_base}.csv"))
# Step 3: Volcano Plot
# Number of taxa except structural zeros
n_taxa = ifelse(is.null(struc_zero), nrow(feature_table), sum(apply(struc_zero, 1, sum) == 0))
# Cutoff values for declaring differentially abundant taxa
cut_off = c(0.9 * (n_taxa -1), 0.8 * (n_taxa -1), 0.7 * (n_taxa -1), 0.6 * (n_taxa -1))
names(cut_off) = c("detected_0.9", "detected_0.8", "detected_0.7", "detected_0.6")
# Annotation data
dat_ann = data.frame(x = min(res$fig$data$x), y = cut_off["detected_0.9"], label = "W[0.9]")
fig = res$fig +
geom_hline(yintercept = cut_off["detected_0.7"], linetype = "dashed") +
geom_text(data = dat_ann, aes(x = x, y = y, label = label),
size = 4, vjust = -0.5, hjust = 0, color = "orange", parse = TRUE)
fig
}
run_ANCOMBC <- function(physeq, formula_string, feature_of_interest, group_var, prev_thresh=.05, taxlevel="genus", filename_base="ancombc_da_analysis"){
require(ANCOMBC)
if(!any(rownames(tax) %in% colnames(matrx))){
stop(" rownames of the tax object must be present in the colnames of the matrx")
}
print(str_glue("aggregating taxa at {taxlevel} level"))
dat <- tax_glom(physeq, taxlevel)
print(str_glue("prevalence filtering"))
dat = filter_taxa(dat, function(x) sum(x > 0) > (prev_thresh*length(x)), TRUE)
print("running ANCOM-BC")
ancombc_obj = ancombc(
phyloseq = dat,
formula = formula_string,
p_adj_method = "holm",
zero_cut = 1-prev_thresh,
lib_cut = 0,
group = group_var,
struc_zero = TRUE,
neg_lb = TRUE,
tol = 1e-5,
max_iter = 100,
conserve = TRUE,
alpha = 0.05,
global = TRUE)
print("saving ANCOMBC object")
save(ancombc_obj, file = paste0(filename_base, ".Rdata"))
write.csv(
rbind(
ancombc_obj$res$beta %>% mutate(tax=rownames(.), metric="coef"),
ancombc_obj$res$se %>% mutate(tax=rownames(.), metric="stderr"),
ancombc_obj$res$q_val %>% mutate(tax=rownames(.), metric="qval")
) %>% left_join(., tax_table(dat) %>% as.data.frame() %>% select(genus) %>% rownames_to_column("tax") %>% distinct()) %>%
select(-tax) %>%
rename("tax"="genus") %>%
pivot_longer(-c(tax, metric), names_to = "comparison", values_to="value") ,
file=paste0(filename_base, ".csv"),
row.names = FALSE)
}
##################3 Agggregation
aggregate_DA_runs <- function(maaslin_path, metagenomeSeq_path, ancom_path, ancombc_path, corncob_rdata, out_prefix, feature_of_interest="regimen_coll_upd"){
maaslin_tax <- read.csv(maaslin_path, sep="\t") %>%
filter(metadata=="regimen_coll_upd") %>% filter(qval < .05)
mg_tax <- read.csv(metagenomeSeq_path, sep="\t") %>% filter(adjPvalues < .05)
ancom_tax <- read.csv(ancom_path, sep=",")
ancombc_tax <- read.csv(ancombc_path, sep=",") %>% pivot_wider(names_from=metric, values_from = value) %>%
filter(grepl(feature_of_interest, comparison)) %>% filter(qval < .05)
load(corncob_rdata, verbose = TRUE) # loads da_analysis
# from https://github.com/bryandmartin/corncob/issues/93
extractCoefficients <- function(diftest_res, taxa){
#add names to all_models list
names(diftest_res$all_models) <- taxa_names(diftest_res$data)
# default is all taxa
if (missing(taxa)) {
taxa <- taxa_names(diftest_res$data)
} else {
taxa <- taxa
}
purrr::map(
diftest_res$all_models[taxa],
~ stats::coef(.)
)
}
# note that the da_analysis$significant_taxa include significance both in terms of dispersion and a
sig_taxa_coefs <- purrr::map_dfr(
.x = da_analysis$significant_taxa,
function(x){
as.data.frame(extractCoefficients(da_analysis, taxa=x)[[1]] ) %>%
rownames_to_column("comparison") %>%
mutate(asv_key=x,
genus= corncob::otu_to_taxonomy(OTU = x, data = da_analysis$data, level = "genus"),
tool="corncob") %>%
rename(coef=Estimate, stderr=`Std. Error`, adjPvalues=`Pr(>|t|)`) %>%
select(-`t value`) %>%
filter(grepl(paste0("mu.",feature_of_interest), comparison)) %>%
mutate(comparison = gsub(paste0("mu.",feature_of_interest), "", comparison)) %>%
filter(adjPvalues < .05) %>%
select(-asv_key)
}
)
venn_data <- list(
"MaAslin2"=unique(maaslin_tax$feature),
"metagenomeSeq"=unique(mg_tax$X),
"ANCOM-II"=unique(ancom_tax$taxa_id),
"ANCOM-BC"=unique(ancombc_tax$tax),
"corncob"= unique(sig_taxa_coefs$genus)
)
venn_data
(
p_venn <- ggvenn::ggvenn(data=venn_data, show_percentage = FALSE, #show_elements = TRUE,text_size = 1,
fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF", "#00FF00FF"),
stroke_size = 0.5, set_name_size = 4
)
)
dir.create("results", showWarnings = FALSE)
ggsave(filename = paste0(out_prefix, "_tools_venn.pdf"), width=4, height=4, plot = p_venn)
p_upset <- ggplot(do.call(rbind,lapply(venn_data,data.frame)) %>%
mutate(tool=gsub("(.*)\\..*", "\\1", rownames(.))) %>%
remove_rownames() %>%
distinct() %>%
rename(genus=1) %>% group_by(genus) %>%
summarize(tools = list(tool)), aes(x=tools)) + geom_bar() +
scale_x_upset(reverse = FALSE)
ggsave(filename = paste0(out_prefix, "_tools_upset.pdf"), width=4, height=4, plot = p_venn)
results_table <- rbind(
maaslin_tax %>% select(feature, coef, value, stderr, qval) %>%
rename(genus=feature, adjPvalues=qval, comparison=value) %>% mutate(tool="MaAsLin2") %>%
mutate(comparison=gsub("[^[:alnum:] ]", "_",comparison)),
mg_tax %>% select(X, any_of(grep(feature_of_interest, colnames(.), value = TRUE)), adjPvalues) %>%
mutate(stderr=NA, tool="metagenomeSeq") %>%
pivot_longer(cols = grep(feature_of_interest, colnames(.) , value=TRUE), names_to = "comparison", values_to="coef") %>%
rename(genus=X) %>%
mutate(comparison=gsub("[^[:alnum:] ]", "_",gsub(feature_of_interest, "", comparison))),
sig_taxa_coefs %>%
mutate(comparison=gsub("[^[:alnum:] ]", "_",comparison)),
ancom_tax %>% rename(genus=taxa_id) %>%
select(genus) %>%
mutate( coef =NA, stderr=NA, comparison=NA, adjPvalues=NA, tool="ANCOM-II"),
ancombc_tax %>% select(tax, coef, comparison, stderr, qval) %>%
rename(genus=tax, adjPvalues=qval) %>% mutate(tool="ANCOM-BC") %>%
mutate(comparison=gsub("[^[:alnum:] ]", "_", gsub(feature_of_interest, "", comparison)))
) %>%
group_by(genus, comparison) %>%
mutate(n_tools=length(unique(tool))) %>%
ungroup() %>%
arrange(desc(n_tools), coef)
knitr::kable(results_table)
write_excel_csv(results_table, file = paste0(out_prefix, "_enrichment_table.csv"))
p_coef <- ggplot(results_table %>% group_by(tool, comparison, sign(coef)) %>%
mutate(relcoef = sign(coef)*(abs(coef)/max(abs(coef)))) %>%
group_by(genus, comparison, tool, sign(coef), adjPvalues, n_tools) %>%
summarize(maxrelcoef = sign(coef) * max(abs(relcoef))) %>%
distinct() %>% ungroup() %>% group_by(tool, genus) %>%
filter(abs(maxrelcoef) == max(abs(maxrelcoef))) ,
aes(x=tool, y=reorder(genus, n_tools), fill=maxrelcoef, alpha=adjPvalues)) +
geom_point(aes(size=adjPvalues),
stroke=1, color="black", shape=21) +
scale_y_discrete(expand = c(.05, .05)) +
scale_color_manual(values = c("white", "black")) +
scale_fill_gradient2(low = "brown", mid = "white", high = "forestgreen", midpoint = 0)+
scale_alpha(range=c(.9, .1)) +
scale_size(range=c(5, .1)) +
theme(panel.grid.major.y = element_line(color="grey70", size=.1),
axis.text.y = element_text(face="italic"),
axis.text.x = element_text(angle=45, hjust=1)) +
labs(title="", y="Genus", x="Tool", fill = "Realtive Coefficent\n(per tool)", alpha="Corrected P-value", size="Corrected P-value")
if (!length(unique(results_table$comparison)) == 1){
ggsave(p_coef + facet_wrap(~comparison, scales = "free_y"), filename = paste0(out_prefix, "coefficients_by_tool.pdf"), width=10, height=7)
} else{
ggsave(p_coef, filename = paste0(out_prefix, "coefficients_by_tool.pdf"), width=6, height=5)
}
return(list(table=results_table, venn=venn_data))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment