nickp60 · May 2, 2023 13:51
diff --git a/cumulative_domination_example.R b/cumulative_domination_example.R
 library(tidyverse)
 if (!"speedyseq" %in% installed.packages()) remotes::install_github("mikemc/speedyseq")
 library(speedyseq)

 download.file("https://figshare.com/ndownloader/files/33076496", "samples.tsv")
 #this file has duplicate rows, and has multiple rows per pool
 samples <- read.csv("samples.tsv") %>% distinct() %>% 
  select(-Pool, -Run, -ShotgunBatchID) %>% distinct()
 #download.file("https://figshare.com/ndownloader/files/26770931", "metadata.tsv")
 #metadata <-  read.csv("metadata.tsv") 
 download.file("https://figshare.com/ndownloader/files/26393788", "asv_counts_w.tsv")
 counts <- read.csv("asv_counts_w.tsv") 
 download.file("https://figshare.com/ndownloader/files/26770997", "taxonomy.tsv")
 taxonomy <- read.csv("taxonomy.tsv")

 phy <- phyloseq(
  sample_data(samples %>% column_to_rownames("SampleID")),
  tax_table(taxonomy %>% select(ASV, Kingdom:Genus) %>% column_to_rownames("ASV") %>% as.matrix()),
  otu_table(counts  %>% pivot_wider(names_from = "SampleID", values_from = "Count", values_fill = 0), taxa_are_rows = TRUE)
 ) %>%  subset_samples(DayRelativeToNearestHCT > -30 & DayRelativeToNearestHCT < 30) %>%
  transform_sample_counts(~./sum(.))

 phy_to_df <- function(phy){
  left_join(
    tax_table(phy) %>% as_tibble(),
    otu_table(phy) %>% as_tibble(),
  ) %>% 
    left_join(
      sample_data(phy) %>% as_tibble()
    ) 
 }

 make_cumu_domination_plot <- function(phy, taxa_of_interest, tax_level, domination_threshold=.3, metadata_group=NULL, any_taxa_of_interest=FALSE){
  taxa_absent <- taxa_of_interest[!taxa_of_interest %in% unique(tax_table(phy_genus)[, tax_level])]
  if (length(taxa_absent) > 0) warning(paste("Some", tax_level, "-level taxa of interest are absent from your dataset: ", paste(taxa_absent, collapse = ",", sep = " ")))
  print(paste("Aggregating to ", tax_level, "level"))
  phyglom <- speedyseq::tax_glom(phy, taxrank = tax_level) 
      
  phydf <- phy_to_df(phyglom) %>% mutate(dominated = .abundance  >= domination_threshold) 
  print("Calculating patients per group")
  if (length(metadata_group) == 0) {
      phydf$group = NA
  } else{
    phydf$group = phydf[[ metadata_group]]
  }
  phydf$taxlev = phydf[[tax_level]]
  phydf <- phydf %>%   
    group_by(group) %>%
    mutate(n_patients_in_group = n_distinct(PatientID)) %>% 
    ungroup()
  
  # do an inner join to subset to the taxa of interest
  taxdf <- data.frame(taxa_of_interest)
  colnames(taxdf) <- tax_level
  print("Subsetting to taxa of interest")
  phydf_of_interest <- phydf %>% inner_join(taxdf)
  
  print("Calculating cumulative domination; this can take a little while")
  if (any_taxa_of_interest){
    print("Aggregating domination across all taxa of interest")
    phydf_of_interest$taxlev <- "Any"
  }
  cumulative_domination <- purrr::map(.x = sort(unique(phydf_of_interest$DayRelativeToNearestHCT)), .f = function(x) {
    phydf_of_interest %>%
      filter(DayRelativeToNearestHCT <= x) %>%
      group_by(group, n_patients_in_group, PatientID, taxlev) %>%
      summarize(any_dominated = any(dominated), .groups="drop") %>%
      group_by(group, taxlev) %>%
      summarize(collection_day = x, pct_any_dominated = sum(any_dominated) / n_patients_in_group * 100, .groups="drop") %>% distinct()
  }) %>%
    bind_rows() %>%
    ungroup() 
  
  print("generating plot")
  p_cumu_dom <- ggplot(cumulative_domination,  aes(x = collection_day, y = pct_any_dominated, color = taxlev)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    labs(x = "Date of collection", y = "Cumulative Domination", color=tax_level) 
  if (length(metadata_group) != 0) {
    p_cumu_dom <- p_cumu_dom + facet_grid(~group)
  }
  return(list("df" = cumulative_domination, "plot" = p_cumu_dom))
 }


 eskape_results <- make_cumu_domination_plot(
  phy = phy, 
  taxa_of_interest = c("Enterococcus", "Staphylococcus",  "Klebsiella", "Acinetobacter", "Pseudomonas",  "Enterobacter"),
  tax_level = "Genus",domination_threshold = .3, any_taxa_of_interest=TRUE
 ) 
 eskape_results_ungrouped <- make_cumu_domination_plot(
  phy = phy, 
  taxa_of_interest = c("Enterococcus", "Staphylococcus",  "Klebsiella", "Acinetobacter", "Pseudomonas",  "Enterobacter"),
  tax_level = "Genus",domination_threshold = .3, 
 ) 

 make_cumu_domination_plot(
  phy = phy, 
  taxa_of_interest = c("Enterococcaceae", "Enterobacteriaceae", "Staphylococcaceae", "Streptococcaceae"), 
  tax_level = "Family",domination_threshold = .3,
 )
	library(tidyverse)
	if (!"speedyseq" %in% installed.packages()) remotes::install_github("mikemc/speedyseq")
	library(speedyseq)

	download.file("https://figshare.com/ndownloader/files/33076496", "samples.tsv")
	#this file has duplicate rows, and has multiple rows per pool
	samples <- read.csv("samples.tsv") %>% distinct() %>%
	select(-Pool, -Run, -ShotgunBatchID) %>% distinct()
	#download.file("https://figshare.com/ndownloader/files/26770931", "metadata.tsv")
	#metadata <- read.csv("metadata.tsv")
	download.file("https://figshare.com/ndownloader/files/26393788", "asv_counts_w.tsv")
	counts <- read.csv("asv_counts_w.tsv")
	download.file("https://figshare.com/ndownloader/files/26770997", "taxonomy.tsv")
	taxonomy <- read.csv("taxonomy.tsv")

	phy <- phyloseq(
	sample_data(samples %>% column_to_rownames("SampleID")),
	tax_table(taxonomy %>% select(ASV, Kingdom:Genus) %>% column_to_rownames("ASV") %>% as.matrix()),
	otu_table(counts %>% pivot_wider(names_from = "SampleID", values_from = "Count", values_fill = 0), taxa_are_rows = TRUE)
	) %>% subset_samples(DayRelativeToNearestHCT > -30 & DayRelativeToNearestHCT < 30) %>%
	transform_sample_counts(~./sum(.))

	phy_to_df <- function(phy){
	left_join(
	tax_table(phy) %>% as_tibble(),
	otu_table(phy) %>% as_tibble(),
	) %>%
	left_join(
	sample_data(phy) %>% as_tibble()
	)
	}

	make_cumu_domination_plot <- function(phy, taxa_of_interest, tax_level, domination_threshold=.3, metadata_group=NULL, any_taxa_of_interest=FALSE){
	taxa_absent <- taxa_of_interest[!taxa_of_interest %in% unique(tax_table(phy_genus)[, tax_level])]
	if (length(taxa_absent) > 0) warning(paste("Some", tax_level, "-level taxa of interest are absent from your dataset: ", paste(taxa_absent, collapse = ",", sep = " ")))
	print(paste("Aggregating to ", tax_level, "level"))
	phyglom <- speedyseq::tax_glom(phy, taxrank = tax_level)

	phydf <- phy_to_df(phyglom) %>% mutate(dominated = .abundance >= domination_threshold)
	print("Calculating patients per group")
	if (length(metadata_group) == 0) {
	phydf$group = NA
	} else{
	phydf$group = phydf[[ metadata_group]]
	}
	phydf$taxlev = phydf[[tax_level]]
	phydf <- phydf %>%
	group_by(group) %>%
	mutate(n_patients_in_group = n_distinct(PatientID)) %>%
	ungroup()

	# do an inner join to subset to the taxa of interest
	taxdf <- data.frame(taxa_of_interest)
	colnames(taxdf) <- tax_level
	print("Subsetting to taxa of interest")
	phydf_of_interest <- phydf %>% inner_join(taxdf)

	print("Calculating cumulative domination; this can take a little while")
	if (any_taxa_of_interest){
	print("Aggregating domination across all taxa of interest")
	phydf_of_interest$taxlev <- "Any"
	}
	cumulative_domination <- purrr::map(.x = sort(unique(phydf_of_interest$DayRelativeToNearestHCT)), .f = function(x) {
	phydf_of_interest %>%
	filter(DayRelativeToNearestHCT <= x) %>%
	group_by(group, n_patients_in_group, PatientID, taxlev) %>%
	summarize(any_dominated = any(dominated), .groups="drop") %>%
	group_by(group, taxlev) %>%
	summarize(collection_day = x, pct_any_dominated = sum(any_dominated) / n_patients_in_group * 100, .groups="drop") %>% distinct()
	}) %>%
	bind_rows() %>%
	ungroup()

	print("generating plot")
	p_cumu_dom <- ggplot(cumulative_domination, aes(x = collection_day, y = pct_any_dominated, color = taxlev)) +
	geom_point() +
	geom_line() +
	theme_bw() +
	labs(x = "Date of collection", y = "Cumulative Domination", color=tax_level)
	if (length(metadata_group) != 0) {
	p_cumu_dom <- p_cumu_dom + facet_grid(~group)
	}
	return(list("df" = cumulative_domination, "plot" = p_cumu_dom))
	}


	eskape_results <- make_cumu_domination_plot(
	phy = phy,
	taxa_of_interest = c("Enterococcus", "Staphylococcus", "Klebsiella", "Acinetobacter", "Pseudomonas", "Enterobacter"),
	tax_level = "Genus",domination_threshold = .3, any_taxa_of_interest=TRUE
	)
	eskape_results_ungrouped <- make_cumu_domination_plot(
	phy = phy,
	taxa_of_interest = c("Enterococcus", "Staphylococcus", "Klebsiella", "Acinetobacter", "Pseudomonas", "Enterobacter"),
	tax_level = "Genus",domination_threshold = .3,
	)

	make_cumu_domination_plot(
	phy = phy,
	taxa_of_interest = c("Enterococcaceae", "Enterobacteriaceae", "Staphylococcaceae", "Streptococcaceae"),
	tax_level = "Family",domination_threshold = .3,
	)
No results found