Last active
June 15, 2023 21:49
-
-
Save genomewalker/c671b8864c33fc0b9c5ccce68f40dc9d to your computer and use it in GitHub Desktop.
Combine taxdb data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(tidyverse) | |
| # Read in the data | |
| setwd("/maps/projects/lundbeck/scratch/taxDB/v6/metadata/src_files") | |
| ncbi_assm_stats <- list.files(".", pattern = "genome_metadata.txt", full.names = TRUE) | |
| ncbi_assm_stats <- map_dfr(ncbi_assm_stats, function(X) { | |
| read_tsv(X, col_names = TRUE) | |
| }) %>% | |
| select(-filename) %>% | |
| rename( | |
| L50 = ctg_N50, | |
| N50 = ctg_L50, | |
| N90 = ctg_L90, | |
| L90 = ctg_N90 | |
| ) | |
| kraken2_select_files <- list.files(".", pattern = "kraken2_select_accessions.txt", full.names = TRUE) | |
| kraken2_select_acc <- map_dfr(kraken2_select_files, function(X) { | |
| read_tsv(X, col_names = FALSE) %>% | |
| select(strain = X1) %>% | |
| mutate(accession = gsub("S__", "", strain)) | |
| }) | |
| fungi <- read_tsv("fungi-assembly-derep-genomes_results.tsv") %>% | |
| select(taxonomy, accession, representative, file = src) %>% | |
| mutate( | |
| library = "fungi", | |
| source = "NCBI_nuclear", | |
| file = basename(file) | |
| ) | |
| invertebrate <- read_tsv("invertebrate-assembly-derep-genomes_results.tsv") %>% | |
| select(taxonomy, accession, representative, file = src) %>% | |
| mutate( | |
| library = "invertebrate", | |
| source = "NCBI_nuclear", | |
| file = basename(file) | |
| ) | |
| organelle <- read_tsv("organelle-derep-genomes_results.tsv") %>% | |
| select(taxonomy, accession, representative, file = src) %>% | |
| mutate( | |
| library = "organelle", | |
| source = "NCBI_organelle", | |
| file = basename(file) | |
| ) | |
| organelle_post <- read_tsv("norway_taxid_accession-refined.tsv") %>% | |
| select(taxonomy = taxon, accession, representative, file = src) %>% | |
| mutate( | |
| library = "organelle", | |
| source = "PhyloNorway_organelle", | |
| file = basename(file) | |
| ) | |
| plant <- read_tsv("plant-assembly-derep-genomes_results.tsv") %>% | |
| select(taxonomy, accession, representative, file = src) %>% | |
| mutate( | |
| library = "plant", | |
| source = "NCBI_nuclear", | |
| file = basename(file) | |
| ) | |
| protozoa <- read_tsv("protozoa-assembly-derep-genomes_results.tsv") %>% | |
| select(taxonomy, accession, representative, file = src) %>% | |
| mutate( | |
| library = "protozoa", | |
| source = "NCBI_nuclear", | |
| file = basename(file) | |
| ) | |
| vertebrate_mammalian <- read_tsv("vertebrate_mammalian-assembly-derep-genomes_results.tsv") %>% | |
| select(taxonomy, accession, representative, file = src) %>% | |
| mutate( | |
| library = "vertebrate_mammalian", | |
| source = "NCBI_nuclear", | |
| file = basename(file) | |
| ) | |
| vertebrate_other <- read_tsv("vertebrate_other-assembly-derep-genomes_results.tsv") %>% | |
| select(taxonomy, accession, representative, file = src) %>% | |
| mutate( | |
| library = "vertebrate_other", | |
| source = "NCBI_nuclear", | |
| file = basename(file) | |
| ) | |
| gtdb <- read_tsv("gtdb_reps_tax.txt", col_names = c("accession", "taxonomy")) %>% | |
| mutate( | |
| library = NA, | |
| source = "GTDB", | |
| file = paste0(accession, "_genomic.fna.gz") | |
| ) | |
| imgvr <- read_tsv("imgvr-derep-genomes_results.tsv") %>% | |
| select(taxonomy, accession, representative, file = src) %>% | |
| mutate( | |
| library = NA, | |
| source = "IMGVR", | |
| file = basename(file) | |
| ) | |
| euk_post <- read_tsv("euk_custom_post_derep_taxonomy.txt", col_names = c("accession", "taxonomy")) %>% | |
| mutate( | |
| library = NA, | |
| file = paste0(accession, ".fa.gz") | |
| ) %>% | |
| mutate(source = case_when( | |
| (grepl("TARA", accession) | grepl("TOSAG", accession)) ~ "TARA_euk", | |
| grepl("MMETSP", accession) ~ "MMETSP", | |
| TRUE ~ "Phylonorway_nuclear" | |
| )) | |
| acc2taxid <- read_tsv("/maps/projects/lundbeck/scratch/taxDB/v6/taxonomy/acc2taxid.map.gz") %>% | |
| select(accession, taxid) | |
| q %>% | |
| write_tsv("taxdb-genome_stats-broad-v6.tsv.gz") | |
| # drep_db <- "/maps/projects/lundbeck/scratch/antonio/projects/taxDB/20220926/TaxDB_v6/results/invertebrate/derep_genomes/derep.db" | |
| # library(DBI) | |
| # library(RSQLite) | |
| # library(dbplyr) | |
| # con <- dbConnect(SQLite(), drep_db) | |
| # tbl(con, "stats") %>% | |
| # collect() %>% | |
| # separate( | |
| # col = taxonomy, | |
| # sep = ";", | |
| # into = c( | |
| # "domain", | |
| # "lineage", | |
| # "kingdom", | |
| # "phylum", | |
| # "class", | |
| # "order", | |
| # "family", | |
| # "genus", | |
| # "species" | |
| # ) | |
| # ) %>% | |
| # filter(species == "s__Heliconius melpomene") %>% View() | |
| # dbListTables(con) | |
| tax_d <- fungi %>% | |
| bind_rows(invertebrate) %>% | |
| bind_rows(organelle) %>% | |
| bind_rows(organelle_post) %>% | |
| bind_rows(plant) %>% | |
| bind_rows(protozoa) %>% | |
| bind_rows(vertebrate_mammalian) %>% | |
| bind_rows(vertebrate_other) %>% | |
| bind_rows(gtdb) %>% | |
| bind_rows(imgvr) %>% | |
| bind_rows(euk_post) %>% | |
| filter(accession %in% kraken2_select_acc$accession) | |
| tax_d %>% | |
| separate( | |
| col = taxonomy, | |
| sep = ";", | |
| into = c( | |
| "domain", | |
| "lineage", | |
| "kingdom", | |
| "phylum", | |
| "class", | |
| "order", | |
| "family", | |
| "genus", | |
| "species" | |
| ) | |
| ) %>% | |
| group_by(domain, species) %>% | |
| distinct() %>% | |
| group_by(domain) %>% | |
| count(sort = T) | |
| tax_d %>% | |
| group_by(source) %>% | |
| count(sort = T) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment