nickp60 · October 23, 2023 20:54
diff --git a/nrw-microbiome.R b/nrw-microbiome.R
 # Source this for a handy discrete color palette and various helper functions for microbiome analyses.
 # not complex enough for an actual package. For now...  #featurecreep
 mycolors=c(
  "#511087", "#4ecdc4", "#ffd447", "#13c01f", "#b78900", "#e66d17",
  "#8e3400", "#729b79", "#bf0000", "#907ad6", "#3f6356", "#ffbfb7",
  "#92bccc", "#c0e8ab", "#890466", "#382438", "#dad2bc", "#0048ff",
  "#f5f749", "#146d25", "#c08552", "#fc60a8", "#89fc00", "#3df8ff",
  "#ef2917", "#101f75", "#bfcccc"
 )

 stripplot_theme <-  theme(
  axis.text.x = element_text(angle=45, hjust=1),
  panel.border = element_blank(),
  panel.background = element_blank(),
  strip.background = element_rect(size=1, fill = "grey90", color="black"),
  axis.line = element_line(size=1),
  axis.line.y.right = element_line(color="green", linewidth=5),
  axis.line.x.top  = element_blank(),
 )


 ###############################  Differential Abundance Helpers
 ############# metagenomeSeq
 run_metagenomeSeq <- function(cmetadata, matrx, tax, formula_string, feature_of_interest, taxlevel="genus", 
                              filename_base="metagenomeSeq_da_analysis", pval_thresh=.05, prev_thresh=.05, abs_coef_thresh=1){
  require(metagenomeSeq)
  init_samples_of_interest <-cmetadata %>% filter(rownames(.) %in% rownames(matrx)) %>% rownames()
  ASV_table <- t(matrx[rownames(matrx) %in% init_samples_of_interest, ])
  #feature_data <- AnnotatedDataFrame(data.frame("ASV"=rownames(ASV_table), row.names = rownames(ASV_table)))
  
  test_obj<- newMRexperiment(
    counts = ASV_table[, init_samples_of_interest], 
    phenoData = AnnotatedDataFrame(cmetadata[init_samples_of_interest, ]), 
    featureData = AnnotatedDataFrame(as.data.frame(as.data.frame(tax) %>% mutate(ASV=rownames(.))))[rownames(ASV_table)]#feature_data[rownames(filtered_counts), ]
  )
  print(paste("aggregating at", taxlevel))
  test_obj_attax <- aggregateByTaxonomy(test_obj, lvl = taxlevel)
  print(str_glue("filtering input data to features over {prev_thresh*100}% prevalence; please perform any abundance filtering prior to execution"))
  test_obj_filtered_attax <- filterData(test_obj_attax, present=nrow(test_obj@phenoData@data) * prev_thresh)
  test_obj_filtered_attax <- filterData(test_obj_filtered_attax, depth = 1)
  samples_of_interest <- colnames(MRcounts(test_obj_filtered_attax))[colSums(MRcounts(test_obj_filtered_attax))!=0]
  
  print(str_glue("initial number of samples: {length(init_samples_of_interest)}; filtered: {length(samples_of_interest)}"))
  print(str_glue("initial number of features: {length(rownames(test_obj))}; filtered: {length(rownames(test_obj_filtered_attax))}"))
  
  print("Calculating normalization")
  p <- cumNormStat(test_obj_filtered_attax, pFlag = T)
  test_obj_filtered_norm <- cumNorm(test_obj_filtered_attax, p=p)
  
  
  mod_class <- model.matrix(
    formula(str_glue("~ {formula_string}")), 
    data=pData(test_obj_filtered_norm)
  )
  print("running fitZig; if this errors, consider checking for and removing NAs in metadata")
  
  regres_class <-  fitZig(test_obj_filtered_norm, mod_class)
  
  coefs_class <- MRcoefs(
    regres_class, 
    coef = grepl(feature_of_interest, colnames(mod_class)), 
    group = 3, number = 50)
  res_table_class <- MRfulltable(regres_class, number = length(rownames(ASV_table)) )
  
  print(str_glue("writing output table to {paste0(filename_base, '.tab')}"))
  
  write.table(res_table_class, file=paste0(filename_base, ".tab"), quote=F, sep="\t", col.names = NA)
  
  
  
  if (is.numeric(cmetadata[, feature_of_interest])){
    pmetagenomeseq <- ggplot(
      coefs_class %>% filter(adjPvalues < pval_thresh) %>% rownames_to_column("taxa") %>%
        filter(abs(get(feature_of_interest)) > abs_coef_thresh),
      aes(y=reorder(taxa, adjPvalues), x=get(feature_of_interest), color=get(feature_of_interest))) + 
      geom_point(size=3)  + 
      labs(
        title="metagenomeSeq Enrichment Analysis",
        x=feature_of_interest, 
        color=feature_of_interest, 
        y="Taxa") + 
      theme(legend.position = "bottom") + scale_color_gradient2(low = "brown", mid = "white", high = "forestgreen", midpoint = 0)
  } else {
    pmetagenomeseq <- ggplot(
      coefs_class %>% 
        rownames_to_column("taxa") %>% 
        pivot_longer(cols=colnames(.)[grepl(feature_of_interest, colnames(.))], 
                     names_to = "feature", values_to = "val") %>% 
        filter(adjPvalues < pval_thresh) %>%
        filter(abs(val) > abs_coef_thresh),
      aes(y=reorder(taxa, adjPvalues), x=val, color=feature)) + 
      geom_point(size=3)  + 
      scale_color_discrete(guide="none") + 
      facet_wrap(~feature) +
      labs(
        title="metagenomeSeq Enrichment Analysis",
        x="Coefficient ", 
        color=feature_of_interest, 
        y="Taxa") + 
      theme(legend.position = "bottom")
    
  }
  
  print(str_glue("writing output figure to {paste0(filename_base, '.png')}"))
  
  ggsave(pmetagenomeseq, filename = paste0(filename_base, ".png"), width=7, height=10)
  
 }


 ########### Corncob
 run_corncob <- function(physeq, formula_string, feature_of_interest, prev_thresh=.05, taxlevel="genus", filename_base="corncob_da_analysis"){
  require(corncob)
  print(str_glue("aggregating taxa at {taxlevel} level"))
  dat <- tax_glom(physeq, taxlevel)
  
  dat = filter_taxa(dat, function(x) sum(x > 0) > (prev_thresh*length(x)), TRUE)
  
  print("constructing model null formula:")  
  null_formula_string = gsub("\\+\\+", "\\+", gsub("\\+$", "", gsub("^\\+", "",gsub(feature_of_interest , "", formula_string))))
  print(null_formula_string)
  
  print("running corncob")
  da_analysis <- differentialTest(
    #try_only = "asv_1",
    formula =  formula(str_glue("~{formula_string}")),
    formula_null = formula(str_glue("~{null_formula_string}")),
    phi.formula = formula(str_glue("~{formula_string}")),
    phi.formula_null = formula(str_glue("~{null_formula_string}")),
    test = "Wald", boot = FALSE,
    data = dat,
    fdr_cutoff = 0.05, verbose=TRUE)
  print("saving corncob object")
  save(da_analysis, file = paste0(filename_base, ".Rdata"))
  
  
  png(filename = paste0(filename_base, ".png"), width = 9, height = 5, units = "in", res = 300)
  plot(da_analysis)
  dev.off()
 }





 ##################### Ancom
 run_Ancom <- function(physeq, formula_string,rand_formula_string=NULL, feature_of_interest, taxlevel="genus", 
                      filename_base="ancom", pval_thresh=.05, prev_thresh=.05, abs_coef_thresh=1, group_var="intensity_update"){
  print(str_glue("Loading ANCOM code"))
  
  # changed struture
  # source("https://raw.githubusercontent.com/FrederickHuangLin/ANCOM/master/scripts/ancom_v2.1.R")   
  source("https://raw.githubusercontent.com/FrederickHuangLin/ANCOM/8abbfb67ed1cb9ae366ab6349e0ad8f017ca6512/scripts/ancom_v2.1.R")

  print(str_glue("Data preprocessing"))
  
  print(paste("aggregating at", taxlevel))
  dat <- tax_glom(physeq, taxlevel)
  
  
  prepro = feature_table_pre_process(
    feature_table = otu_table(dat) %>% data.frame() %>% t(),
    meta_data=sample_data(dat) %>%data.frame() %>% rownames_to_column("sampleid"), 
    sample_var="sampleid", group_var=group_var, 
    out_cut= 0.05, zero_cut= 0.90, lib_cut = 0, neg_lb=TRUE)
  feature_table = prepro$feature_table # Preprocessed feature table
  meta_data = prepro$meta_data # Preprocessed metadata
  struc_zero = prepro$structure_zeros # Structural zero info
  
  
  # Step 2: ANCOM
  print(str_glue("Running ANCOM"))
  t_start = Sys.time()
  res = ANCOM(
    feature_table = prepro$feature_table,
    meta_data = prepro$meta_data,
    struc_zero=prepro$structure_zeros ,# Structural zero info,
    main_var = feature_of_interest, 
    p_adj_method="BH", 
    alpha = 0.05,
    adj_formula=formula_string,
    rand_formula=rand_formula_string, 
    control = lmeControl(maxIter = 100, msMaxIter = 100, opt = "optim"))
  t_end = Sys.time()
  t_run = t_end - t_start 
  print(str_glue("Done running ANCOM: took {t_run} "))
  
  write_csv(res$out, str_glue("{filename_base}.csv"))
  
  # Step 3: Volcano Plot
  
  # Number of taxa except structural zeros
  n_taxa = ifelse(is.null(struc_zero), nrow(feature_table), sum(apply(struc_zero, 1, sum) == 0))
  # Cutoff values for declaring differentially abundant taxa
  cut_off = c(0.9 * (n_taxa -1), 0.8 * (n_taxa -1), 0.7 * (n_taxa -1), 0.6 * (n_taxa -1))
  names(cut_off) = c("detected_0.9", "detected_0.8", "detected_0.7", "detected_0.6")
  
  # Annotation data
  dat_ann = data.frame(x = min(res$fig$data$x), y = cut_off["detected_0.9"], label = "W[0.9]")
  
  fig = res$fig + 
    geom_hline(yintercept = cut_off["detected_0.7"], linetype = "dashed") + 
    geom_text(data = dat_ann, aes(x = x, y = y, label = label), 
              size = 4, vjust = -0.5, hjust = 0, color = "orange", parse = TRUE)
  fig   
  
  
 }


 run_ANCOMBC <- function(physeq, formula_string, feature_of_interest, group_var, prev_thresh=.05, taxlevel="genus", filename_base="ancombc_da_analysis"){
  require(ANCOMBC)
  if(!any(rownames(tax) %in% colnames(matrx))){
    stop(" rownames of the tax object must be present in the colnames of the matrx")
  }
  print(str_glue("aggregating taxa at {taxlevel} level"))
  dat <- tax_glom(physeq, taxlevel)
  print(str_glue("prevalence filtering"))
  dat = filter_taxa(dat, function(x) sum(x > 0) > (prev_thresh*length(x)), TRUE)
  
  print("running ANCOM-BC")
  ancombc_obj = ancombc(
    phyloseq = dat, 
    formula = formula_string,
    p_adj_method = "holm", 
    zero_cut = 1-prev_thresh, 
    lib_cut = 0, 
    group = group_var, 
    struc_zero = TRUE, 
    neg_lb = TRUE, 
    tol = 1e-5, 
    max_iter = 100, 
    conserve = TRUE, 
    alpha = 0.05,
    global = TRUE)
  
  print("saving ANCOMBC object")
  save(ancombc_obj, file = paste0(filename_base, ".Rdata"))
  write.csv(
    rbind(
      ancombc_obj$res$beta %>% mutate(tax=rownames(.), metric="coef"),
      ancombc_obj$res$se %>% mutate(tax=rownames(.), metric="stderr"),
      ancombc_obj$res$q_val %>% mutate(tax=rownames(.), metric="qval")
    ) %>% left_join(., tax_table(dat) %>% as.data.frame() %>% select(genus) %>% rownames_to_column("tax") %>% distinct()) %>% 
      select(-tax) %>%
      rename("tax"="genus") %>% 
      pivot_longer(-c(tax, metric), names_to = "comparison", values_to="value") , 
    file=paste0(filename_base, ".csv"),
    row.names = FALSE)
 }





 ##################3  Agggregation
 aggregate_DA_runs <- function(maaslin_path, metagenomeSeq_path, ancom_path, ancombc_path, corncob_rdata, out_prefix, feature_of_interest="regimen_coll_upd"){
  
  maaslin_tax <- read.csv(maaslin_path, sep="\t") %>%
    filter(metadata=="regimen_coll_upd") %>% filter(qval < .05)
  mg_tax <- read.csv(metagenomeSeq_path, sep="\t")  %>% filter(adjPvalues < .05)
  ancom_tax <- read.csv(ancom_path, sep=",")
  ancombc_tax <- read.csv(ancombc_path, sep=",") %>%  pivot_wider(names_from=metric, values_from = value) %>% 
    filter(grepl(feature_of_interest, comparison)) %>% filter(qval < .05)
  load(corncob_rdata, verbose = TRUE) # loads da_analysis
  
  
  
  # from https://github.com/bryandmartin/corncob/issues/93
  extractCoefficients <- function(diftest_res, taxa){
    
    #add names to all_models list
    names(diftest_res$all_models) <- taxa_names(diftest_res$data)
    
    # default is all taxa
    if (missing(taxa)) {
      taxa <- taxa_names(diftest_res$data)
    } else {
      taxa <- taxa
    }
    
    purrr::map(
      diftest_res$all_models[taxa],
      ~ stats::coef(.)
    )
    
  }
  
  
  # note that the da_analysis$significant_taxa include significance both in terms of dispersion and a
  sig_taxa_coefs <- purrr::map_dfr(
    .x = da_analysis$significant_taxa, 
    function(x){
      as.data.frame(extractCoefficients(da_analysis, taxa=x)[[1]] ) %>%  
        rownames_to_column("comparison") %>% 
        mutate(asv_key=x,
               genus= corncob::otu_to_taxonomy(OTU = x, data = da_analysis$data, level = "genus"),
               tool="corncob") %>%
        rename(coef=Estimate, stderr=`Std. Error`, adjPvalues=`Pr(>|t|)`) %>%
        select(-`t value`) %>%
        filter(grepl(paste0("mu.",feature_of_interest), comparison)) %>%
        mutate(comparison = gsub(paste0("mu.",feature_of_interest), "", comparison)) %>%
        filter(adjPvalues < .05) %>%
        select(-asv_key)
    }
  )
  
  
  venn_data <- list(
    "MaAslin2"=unique(maaslin_tax$feature),
    "metagenomeSeq"=unique(mg_tax$X),
    "ANCOM-II"=unique(ancom_tax$taxa_id),
    "ANCOM-BC"=unique(ancombc_tax$tax),
    "corncob"= unique(sig_taxa_coefs$genus)
  )
  venn_data
  (
    p_venn <- ggvenn::ggvenn(data=venn_data, show_percentage = FALSE, #show_elements = TRUE,text_size = 1,
                             fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF", "#00FF00FF"),
                             stroke_size = 0.5, set_name_size = 4
    )
  )
  dir.create("results", showWarnings = FALSE)
  ggsave(filename = paste0(out_prefix, "_tools_venn.pdf"), width=4, height=4, plot = p_venn)
  
  p_upset <- ggplot(do.call(rbind,lapply(venn_data,data.frame)) %>% 
                      mutate(tool=gsub("(.*)\\..*", "\\1", rownames(.))) %>% 
                      remove_rownames() %>%   
                      distinct() %>%
                      rename(genus=1) %>% group_by(genus) %>% 
                      summarize(tools = list(tool)), aes(x=tools)) +   geom_bar() +
    scale_x_upset(reverse = FALSE)
  ggsave(filename = paste0(out_prefix, "_tools_upset.pdf"), width=4, height=4, plot = p_venn)
  
  
  results_table <- rbind(
    maaslin_tax %>% select(feature, coef, value, stderr, qval) %>% 
      rename(genus=feature, adjPvalues=qval, comparison=value) %>% mutate(tool="MaAsLin2") %>%
      mutate(comparison=gsub("[^[:alnum:] ]", "_",comparison)),
    mg_tax %>% select(X, any_of(grep(feature_of_interest, colnames(.), value = TRUE)), adjPvalues) %>% 
      mutate(stderr=NA, tool="metagenomeSeq") %>% 
      pivot_longer(cols = grep(feature_of_interest, colnames(.) , value=TRUE), names_to = "comparison", values_to="coef") %>% 
      rename(genus=X) %>%
      mutate(comparison=gsub("[^[:alnum:] ]", "_",gsub(feature_of_interest, "", comparison))),
    sig_taxa_coefs %>%
      mutate(comparison=gsub("[^[:alnum:] ]", "_",comparison)),
    ancom_tax %>% rename(genus=taxa_id) %>% 
      select(genus) %>% 
      mutate( coef =NA,   stderr=NA,  comparison=NA, adjPvalues=NA, tool="ANCOM-II"),
    ancombc_tax  %>% select(tax, coef, comparison, stderr, qval) %>% 
      rename(genus=tax, adjPvalues=qval) %>% mutate(tool="ANCOM-BC") %>%
      mutate(comparison=gsub("[^[:alnum:] ]", "_", gsub(feature_of_interest, "", comparison)))
    
  ) %>% 
    group_by(genus, comparison) %>% 
    mutate(n_tools=length(unique(tool))) %>%
    ungroup() %>%
    arrange(desc(n_tools), coef) 
  
  knitr::kable(results_table)
  
  write_excel_csv(results_table, file =  paste0(out_prefix, "_enrichment_table.csv"))
  p_coef <- ggplot(results_table %>% group_by(tool, comparison, sign(coef)) %>% 
                     mutate(relcoef = sign(coef)*(abs(coef)/max(abs(coef)))) %>%
                     group_by(genus,  comparison, tool, sign(coef), adjPvalues, n_tools) %>%
                     summarize(maxrelcoef = sign(coef) * max(abs(relcoef))) %>%
                     distinct() %>%  ungroup() %>% group_by(tool, genus) %>% 
                     filter(abs(maxrelcoef) == max(abs(maxrelcoef))) ,
                   aes(x=tool, y=reorder(genus, n_tools),  fill=maxrelcoef, alpha=adjPvalues)) +
    geom_point(aes(size=adjPvalues), 
               stroke=1, color="black", shape=21)  +
    scale_y_discrete(expand = c(.05, .05)) +
    scale_color_manual(values = c("white", "black")) +
    scale_fill_gradient2(low = "brown", mid = "white", high = "forestgreen", midpoint = 0)+
    scale_alpha(range=c(.9, .1)) + 
    scale_size(range=c(5, .1)) + 
    theme(panel.grid.major.y = element_line(color="grey70", size=.1), 
          axis.text.y = element_text(face="italic"),
          axis.text.x = element_text(angle=45, hjust=1)) +
    labs(title="", y="Genus", x="Tool", fill = "Realtive Coefficent\n(per tool)", alpha="Corrected P-value", size="Corrected P-value") 
  
  if (!length(unique(results_table$comparison)) == 1){
    
    ggsave(p_coef + facet_wrap(~comparison, scales = "free_y"), filename =  paste0(out_prefix, "coefficients_by_tool.pdf"), width=10, height=7)
  } else{
    ggsave(p_coef, filename =  paste0(out_prefix, "coefficients_by_tool.pdf"), width=6, height=5)
  }
  return(list(table=results_table, venn=venn_data))
 }
	# Source this for a handy discrete color palette and various helper functions for microbiome analyses.
	# not complex enough for an actual package. For now... #featurecreep
	mycolors=c(
	"#511087", "#4ecdc4", "#ffd447", "#13c01f", "#b78900", "#e66d17",
	"#8e3400", "#729b79", "#bf0000", "#907ad6", "#3f6356", "#ffbfb7",
	"#92bccc", "#c0e8ab", "#890466", "#382438", "#dad2bc", "#0048ff",
	"#f5f749", "#146d25", "#c08552", "#fc60a8", "#89fc00", "#3df8ff",
	"#ef2917", "#101f75", "#bfcccc"
	)

	stripplot_theme <- theme(
	axis.text.x = element_text(angle=45, hjust=1),
	panel.border = element_blank(),
	panel.background = element_blank(),
	strip.background = element_rect(size=1, fill = "grey90", color="black"),
	axis.line = element_line(size=1),
	axis.line.y.right = element_line(color="green", linewidth=5),
	axis.line.x.top = element_blank(),
	)


	############################### Differential Abundance Helpers
	############# metagenomeSeq
	run_metagenomeSeq <- function(cmetadata, matrx, tax, formula_string, feature_of_interest, taxlevel="genus",
	filename_base="metagenomeSeq_da_analysis", pval_thresh=.05, prev_thresh=.05, abs_coef_thresh=1){
	require(metagenomeSeq)
	init_samples_of_interest <-cmetadata %>% filter(rownames(.) %in% rownames(matrx)) %>% rownames()
	ASV_table <- t(matrx[rownames(matrx) %in% init_samples_of_interest, ])
	#feature_data <- AnnotatedDataFrame(data.frame("ASV"=rownames(ASV_table), row.names = rownames(ASV_table)))

	test_obj<- newMRexperiment(
	counts = ASV_table[, init_samples_of_interest],
	phenoData = AnnotatedDataFrame(cmetadata[init_samples_of_interest, ]),
	featureData = AnnotatedDataFrame(as.data.frame(as.data.frame(tax) %>% mutate(ASV=rownames(.))))[rownames(ASV_table)]#feature_data[rownames(filtered_counts), ]
	)
	print(paste("aggregating at", taxlevel))
	test_obj_attax <- aggregateByTaxonomy(test_obj, lvl = taxlevel)
	print(str_glue("filtering input data to features over {prev_thresh*100}% prevalence; please perform any abundance filtering prior to execution"))
	test_obj_filtered_attax <- filterData(test_obj_attax, present=nrow(test_obj@phenoData@data) * prev_thresh)
	test_obj_filtered_attax <- filterData(test_obj_filtered_attax, depth = 1)
	samples_of_interest <- colnames(MRcounts(test_obj_filtered_attax))[colSums(MRcounts(test_obj_filtered_attax))!=0]

	print(str_glue("initial number of samples: {length(init_samples_of_interest)}; filtered: {length(samples_of_interest)}"))
	print(str_glue("initial number of features: {length(rownames(test_obj))}; filtered: {length(rownames(test_obj_filtered_attax))}"))

	print("Calculating normalization")
	p <- cumNormStat(test_obj_filtered_attax, pFlag = T)
	test_obj_filtered_norm <- cumNorm(test_obj_filtered_attax, p=p)


	mod_class <- model.matrix(
	formula(str_glue("~ {formula_string}")),
	data=pData(test_obj_filtered_norm)
	)
	print("running fitZig; if this errors, consider checking for and removing NAs in metadata")

	regres_class <- fitZig(test_obj_filtered_norm, mod_class)

	coefs_class <- MRcoefs(
	regres_class,
	coef = grepl(feature_of_interest, colnames(mod_class)),
	group = 3, number = 50)
	res_table_class <- MRfulltable(regres_class, number = length(rownames(ASV_table)) )

	print(str_glue("writing output table to {paste0(filename_base, '.tab')}"))

	write.table(res_table_class, file=paste0(filename_base, ".tab"), quote=F, sep="\t", col.names = NA)



	if (is.numeric(cmetadata[, feature_of_interest])){
	pmetagenomeseq <- ggplot(
	coefs_class %>% filter(adjPvalues < pval_thresh) %>% rownames_to_column("taxa") %>%
	filter(abs(get(feature_of_interest)) > abs_coef_thresh),
	aes(y=reorder(taxa, adjPvalues), x=get(feature_of_interest), color=get(feature_of_interest))) +
	geom_point(size=3) +
	labs(
	title="metagenomeSeq Enrichment Analysis",
	x=feature_of_interest,
	color=feature_of_interest,
	y="Taxa") +
	theme(legend.position = "bottom") + scale_color_gradient2(low = "brown", mid = "white", high = "forestgreen", midpoint = 0)
	} else {
	pmetagenomeseq <- ggplot(
	coefs_class %>%
	rownames_to_column("taxa") %>%
	pivot_longer(cols=colnames(.)[grepl(feature_of_interest, colnames(.))],
	names_to = "feature", values_to = "val") %>%
	filter(adjPvalues < pval_thresh) %>%
	filter(abs(val) > abs_coef_thresh),
	aes(y=reorder(taxa, adjPvalues), x=val, color=feature)) +
	geom_point(size=3) +
	scale_color_discrete(guide="none") +
	facet_wrap(~feature) +
	labs(
	title="metagenomeSeq Enrichment Analysis",
	x="Coefficient ",
	color=feature_of_interest,
	y="Taxa") +
	theme(legend.position = "bottom")

	}

	print(str_glue("writing output figure to {paste0(filename_base, '.png')}"))

	ggsave(pmetagenomeseq, filename = paste0(filename_base, ".png"), width=7, height=10)

	}


	########### Corncob
	run_corncob <- function(physeq, formula_string, feature_of_interest, prev_thresh=.05, taxlevel="genus", filename_base="corncob_da_analysis"){
	require(corncob)
	print(str_glue("aggregating taxa at {taxlevel} level"))
	dat <- tax_glom(physeq, taxlevel)

	dat = filter_taxa(dat, function(x) sum(x > 0) > (prev_thresh*length(x)), TRUE)

	print("constructing model null formula:")
	null_formula_string = gsub("\\+\\+", "\\+", gsub("\\+$", "", gsub("^\\+", "",gsub(feature_of_interest , "", formula_string))))
	print(null_formula_string)

	print("running corncob")
	da_analysis <- differentialTest(
	#try_only = "asv_1",
	formula = formula(str_glue("~{formula_string}")),
	formula_null = formula(str_glue("~{null_formula_string}")),
	phi.formula = formula(str_glue("~{formula_string}")),
	phi.formula_null = formula(str_glue("~{null_formula_string}")),
	test = "Wald", boot = FALSE,
	data = dat,
	fdr_cutoff = 0.05, verbose=TRUE)
	print("saving corncob object")
	save(da_analysis, file = paste0(filename_base, ".Rdata"))


	png(filename = paste0(filename_base, ".png"), width = 9, height = 5, units = "in", res = 300)
	plot(da_analysis)
	dev.off()
	}





	##################### Ancom
	run_Ancom <- function(physeq, formula_string,rand_formula_string=NULL, feature_of_interest, taxlevel="genus",
	filename_base="ancom", pval_thresh=.05, prev_thresh=.05, abs_coef_thresh=1, group_var="intensity_update"){
	print(str_glue("Loading ANCOM code"))

	# changed struture
	# source("https://raw.githubusercontent.com/FrederickHuangLin/ANCOM/master/scripts/ancom_v2.1.R")
	source("https://raw.githubusercontent.com/FrederickHuangLin/ANCOM/8abbfb67ed1cb9ae366ab6349e0ad8f017ca6512/scripts/ancom_v2.1.R")

	print(str_glue("Data preprocessing"))

	print(paste("aggregating at", taxlevel))
	dat <- tax_glom(physeq, taxlevel)


	prepro = feature_table_pre_process(
	feature_table = otu_table(dat) %>% data.frame() %>% t(),
	meta_data=sample_data(dat) %>%data.frame() %>% rownames_to_column("sampleid"),
	sample_var="sampleid", group_var=group_var,
	out_cut= 0.05, zero_cut= 0.90, lib_cut = 0, neg_lb=TRUE)
	feature_table = prepro$feature_table # Preprocessed feature table
	meta_data = prepro$meta_data # Preprocessed metadata
	struc_zero = prepro$structure_zeros # Structural zero info


	# Step 2: ANCOM
	print(str_glue("Running ANCOM"))
	t_start = Sys.time()
	res = ANCOM(
	feature_table = prepro$feature_table,
	meta_data = prepro$meta_data,
	struc_zero=prepro$structure_zeros ,# Structural zero info,
	main_var = feature_of_interest,
	p_adj_method="BH",
	alpha = 0.05,
	adj_formula=formula_string,
	rand_formula=rand_formula_string,
	control = lmeControl(maxIter = 100, msMaxIter = 100, opt = "optim"))
	t_end = Sys.time()
	t_run = t_end - t_start
	print(str_glue("Done running ANCOM: took {t_run} "))

	write_csv(res$out, str_glue("{filename_base}.csv"))

	# Step 3: Volcano Plot

	# Number of taxa except structural zeros
	n_taxa = ifelse(is.null(struc_zero), nrow(feature_table), sum(apply(struc_zero, 1, sum) == 0))
	# Cutoff values for declaring differentially abundant taxa
	cut_off = c(0.9 * (n_taxa -1), 0.8 * (n_taxa -1), 0.7 * (n_taxa -1), 0.6 * (n_taxa -1))
	names(cut_off) = c("detected_0.9", "detected_0.8", "detected_0.7", "detected_0.6")

	# Annotation data
	dat_ann = data.frame(x = min(res$fig$data$x), y = cut_off["detected_0.9"], label = "W[0.9]")

	fig = res$fig +
	geom_hline(yintercept = cut_off["detected_0.7"], linetype = "dashed") +
	geom_text(data = dat_ann, aes(x = x, y = y, label = label),
	size = 4, vjust = -0.5, hjust = 0, color = "orange", parse = TRUE)
	fig


	}


	run_ANCOMBC <- function(physeq, formula_string, feature_of_interest, group_var, prev_thresh=.05, taxlevel="genus", filename_base="ancombc_da_analysis"){
	require(ANCOMBC)
	if(!any(rownames(tax) %in% colnames(matrx))){
	stop(" rownames of the tax object must be present in the colnames of the matrx")
	}
	print(str_glue("aggregating taxa at {taxlevel} level"))
	dat <- tax_glom(physeq, taxlevel)
	print(str_glue("prevalence filtering"))
	dat = filter_taxa(dat, function(x) sum(x > 0) > (prev_thresh*length(x)), TRUE)

	print("running ANCOM-BC")
	ancombc_obj = ancombc(
	phyloseq = dat,
	formula = formula_string,
	p_adj_method = "holm",
	zero_cut = 1-prev_thresh,
	lib_cut = 0,
	group = group_var,
	struc_zero = TRUE,
	neg_lb = TRUE,
	tol = 1e-5,
	max_iter = 100,
	conserve = TRUE,
	alpha = 0.05,
	global = TRUE)

	print("saving ANCOMBC object")
	save(ancombc_obj, file = paste0(filename_base, ".Rdata"))
	write.csv(
	rbind(
	ancombc_obj$res$beta %>% mutate(tax=rownames(.), metric="coef"),
	ancombc_obj$res$se %>% mutate(tax=rownames(.), metric="stderr"),
	ancombc_obj$res$q_val %>% mutate(tax=rownames(.), metric="qval")
	) %>% left_join(., tax_table(dat) %>% as.data.frame() %>% select(genus) %>% rownames_to_column("tax") %>% distinct()) %>%
	select(-tax) %>%
	rename("tax"="genus") %>%
	pivot_longer(-c(tax, metric), names_to = "comparison", values_to="value") ,
	file=paste0(filename_base, ".csv"),
	row.names = FALSE)
	}





	##################3 Agggregation
	aggregate_DA_runs <- function(maaslin_path, metagenomeSeq_path, ancom_path, ancombc_path, corncob_rdata, out_prefix, feature_of_interest="regimen_coll_upd"){

	maaslin_tax <- read.csv(maaslin_path, sep="\t") %>%
	filter(metadata=="regimen_coll_upd") %>% filter(qval < .05)
	mg_tax <- read.csv(metagenomeSeq_path, sep="\t") %>% filter(adjPvalues < .05)
	ancom_tax <- read.csv(ancom_path, sep=",")
	ancombc_tax <- read.csv(ancombc_path, sep=",") %>% pivot_wider(names_from=metric, values_from = value) %>%
	filter(grepl(feature_of_interest, comparison)) %>% filter(qval < .05)
	load(corncob_rdata, verbose = TRUE) # loads da_analysis



	# from https://github.com/bryandmartin/corncob/issues/93
	extractCoefficients <- function(diftest_res, taxa){

	#add names to all_models list
	names(diftest_res$all_models) <- taxa_names(diftest_res$data)

	# default is all taxa
	if (missing(taxa)) {
	taxa <- taxa_names(diftest_res$data)
	} else {
	taxa <- taxa
	}

	purrr::map(
	diftest_res$all_models[taxa],
	~ stats::coef(.)
	)

	}


	# note that the da_analysis$significant_taxa include significance both in terms of dispersion and a
	sig_taxa_coefs <- purrr::map_dfr(
	.x = da_analysis$significant_taxa,
	function(x){
	as.data.frame(extractCoefficients(da_analysis, taxa=x)[[1]] ) %>%
	rownames_to_column("comparison") %>%
	mutate(asv_key=x,
	genus= corncob::otu_to_taxonomy(OTU = x, data = da_analysis$data, level = "genus"),
	tool="corncob") %>%
	rename(coef=Estimate, stderr=`Std. Error`, adjPvalues=`Pr(>\|t\|)`) %>%
	select(-`t value`) %>%
	filter(grepl(paste0("mu.",feature_of_interest), comparison)) %>%
	mutate(comparison = gsub(paste0("mu.",feature_of_interest), "", comparison)) %>%
	filter(adjPvalues < .05) %>%
	select(-asv_key)
	}
	)


	venn_data <- list(
	"MaAslin2"=unique(maaslin_tax$feature),
	"metagenomeSeq"=unique(mg_tax$X),
	"ANCOM-II"=unique(ancom_tax$taxa_id),
	"ANCOM-BC"=unique(ancombc_tax$tax),
	"corncob"= unique(sig_taxa_coefs$genus)
	)
	venn_data
	(
	p_venn <- ggvenn::ggvenn(data=venn_data, show_percentage = FALSE, #show_elements = TRUE,text_size = 1,
	fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF", "#00FF00FF"),
	stroke_size = 0.5, set_name_size = 4
	)
	)
	dir.create("results", showWarnings = FALSE)
	ggsave(filename = paste0(out_prefix, "_tools_venn.pdf"), width=4, height=4, plot = p_venn)

	p_upset <- ggplot(do.call(rbind,lapply(venn_data,data.frame)) %>%
	mutate(tool=gsub("(.)\\..", "\\1", rownames(.))) %>%
	remove_rownames() %>%
	distinct() %>%
	rename(genus=1) %>% group_by(genus) %>%
	summarize(tools = list(tool)), aes(x=tools)) + geom_bar() +
	scale_x_upset(reverse = FALSE)
	ggsave(filename = paste0(out_prefix, "_tools_upset.pdf"), width=4, height=4, plot = p_venn)


	results_table <- rbind(
	maaslin_tax %>% select(feature, coef, value, stderr, qval) %>%
	rename(genus=feature, adjPvalues=qval, comparison=value) %>% mutate(tool="MaAsLin2") %>%
	mutate(comparison=gsub("[^[:alnum:] ]", "_",comparison)),
	mg_tax %>% select(X, any_of(grep(feature_of_interest, colnames(.), value = TRUE)), adjPvalues) %>%
	mutate(stderr=NA, tool="metagenomeSeq") %>%
	pivot_longer(cols = grep(feature_of_interest, colnames(.) , value=TRUE), names_to = "comparison", values_to="coef") %>%
	rename(genus=X) %>%
	mutate(comparison=gsub("[^[:alnum:] ]", "_",gsub(feature_of_interest, "", comparison))),
	sig_taxa_coefs %>%
	mutate(comparison=gsub("[^[:alnum:] ]", "_",comparison)),
	ancom_tax %>% rename(genus=taxa_id) %>%
	select(genus) %>%
	mutate( coef =NA, stderr=NA, comparison=NA, adjPvalues=NA, tool="ANCOM-II"),
	ancombc_tax %>% select(tax, coef, comparison, stderr, qval) %>%
	rename(genus=tax, adjPvalues=qval) %>% mutate(tool="ANCOM-BC") %>%
	mutate(comparison=gsub("[^[:alnum:] ]", "_", gsub(feature_of_interest, "", comparison)))

	) %>%
	group_by(genus, comparison) %>%
	mutate(n_tools=length(unique(tool))) %>%
	ungroup() %>%
	arrange(desc(n_tools), coef)

	knitr::kable(results_table)

	write_excel_csv(results_table, file = paste0(out_prefix, "_enrichment_table.csv"))
	p_coef <- ggplot(results_table %>% group_by(tool, comparison, sign(coef)) %>%
	mutate(relcoef = sign(coef)*(abs(coef)/max(abs(coef)))) %>%
	group_by(genus, comparison, tool, sign(coef), adjPvalues, n_tools) %>%
	summarize(maxrelcoef = sign(coef) * max(abs(relcoef))) %>%
	distinct() %>% ungroup() %>% group_by(tool, genus) %>%
	filter(abs(maxrelcoef) == max(abs(maxrelcoef))) ,
	aes(x=tool, y=reorder(genus, n_tools), fill=maxrelcoef, alpha=adjPvalues)) +
	geom_point(aes(size=adjPvalues),
	stroke=1, color="black", shape=21) +
	scale_y_discrete(expand = c(.05, .05)) +
	scale_color_manual(values = c("white", "black")) +
	scale_fill_gradient2(low = "brown", mid = "white", high = "forestgreen", midpoint = 0)+
	scale_alpha(range=c(.9, .1)) +
	scale_size(range=c(5, .1)) +
	theme(panel.grid.major.y = element_line(color="grey70", size=.1),
	axis.text.y = element_text(face="italic"),
	axis.text.x = element_text(angle=45, hjust=1)) +
	labs(title="", y="Genus", x="Tool", fill = "Realtive Coefficent\n(per tool)", alpha="Corrected P-value", size="Corrected P-value")

	if (!length(unique(results_table$comparison)) == 1){

	ggsave(p_coef + facet_wrap(~comparison, scales = "free_y"), filename = paste0(out_prefix, "coefficients_by_tool.pdf"), width=10, height=7)
	} else{
	ggsave(p_coef, filename = paste0(out_prefix, "coefficients_by_tool.pdf"), width=6, height=5)
	}
	return(list(table=results_table, venn=venn_data))
	}
No results found