|
#!/usr/bin/env python3 |
|
""" |
|
Test Taxoniq CLI against known divergences identified in BugSigDB issue #248. |
|
|
|
This script tests specific taxonomy IDs and scenarios that are known to have |
|
differences between Taxoniq and the NCBI API. |
|
|
|
Key known divergences: |
|
1. Taxoniq uses k__Bacteria (2) while NCBI may use newer kingdom ranks like |
|
k__Pseudomonadati (3379134) or k__Thermotogati (3384194) |
|
2. Taxoniq includes Eukaryota (2759) in lineages while NCBI may start at lower ranks |
|
3. Rank name changes: superkingdom -> domain (as of March 2025) |
|
4. Missing taxa in older Taxoniq DB |
|
""" |
|
|
|
import json |
|
import subprocess |
|
import sys |
|
|
|
def run_taxoniq_cli(tax_id, command='scientific-name'):
    """Run a single-value taxoniq CLI command and return its cleaned output.

    Args:
        tax_id: Taxonomy ID; converted with ``str()`` and passed to
            ``--taxon-id``.
        command: taxoniq sub-command to invoke (default
            ``'scientific-name'``).

    Returns:
        The command's stdout with surrounding whitespace and double quotes
        removed. When the *entire* output is an Enum-style repr such as
        ``<Rank.superkingdom: 'superkingdom'>``, only the member name
        (``superkingdom``) is returned. ``None`` is returned when the CLI
        exits with a non-zero status, cannot be started, or exceeds the
        5-second timeout.
    """
    # Only the subprocess call can fail here, so keep the try body minimal.
    try:
        result = subprocess.run(
            ['taxoniq', command, '--taxon-id', str(tax_id)],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: binary missing / not executable; SubprocessError: timeout;
        # ValueError: bad argument or undecodable output.
        print(f" ERROR running CLI: {e}")
        return None

    if result.returncode != 0:
        return None

    output = result.stdout.strip().strip('"')

    # Reduce an Enum repr to its member name. The check is anchored to the
    # whole string so that ordinary output which merely contains '<', '>'
    # and '.' is returned untouched.
    if output.startswith('<') and output.endswith('>'):
        # Cut at the first ':' before looking for the dot, so a dot inside
        # the enum *value* cannot be mistaken for the class/member separator.
        qualified_name = output[1:-1].partition(':')[0]
        _, dot, member = qualified_name.rpartition('.')
        if dot:
            return member.strip()
    return output
|
|
|
def run_taxoniq_cli_json(tax_id, command='lineage'):
    """Run a taxoniq CLI command with ``--output-format json`` and parse it.

    Args:
        tax_id: Taxonomy ID; converted with ``str()`` and passed to
            ``--taxon-id``.
        command: taxoniq sub-command to invoke (default ``'lineage'``).

    Returns:
        The first JSON value found in the command's stdout, or ``None`` when
        the CLI exits non-zero, cannot be started, times out, or prints
        nothing that parses as JSON.
    """
    try:
        result = subprocess.run(
            ['taxoniq', command, '--taxon-id', str(tax_id), '--output-format', 'json'],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: binary missing / not executable; SubprocessError: timeout;
        # ValueError: bad argument or undecodable output.
        print(f" ERROR running CLI JSON: {e}")
        return None

    if result.returncode != 0:
        return None

    lines = [line.strip() for line in result.stdout.strip().split('\n')]
    decoder = json.JSONDecoder()
    first_error = None

    # Try every line that could open a JSON document. raw_decode() stops at
    # the end of the first complete value, so non-JSON text printed *after*
    # the payload no longer breaks parsing, and a bracketed log line printed
    # *before* it is skipped instead of poisoning the whole parse.
    for start, line in enumerate(lines):
        if not line.startswith(('[', '{')):
            continue
        try:
            value, _ = decoder.raw_decode('\n'.join(lines[start:]))
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            if first_error is None:
                first_error = e
            continue
        return value

    if first_error is not None:
        print(f" ERROR running CLI JSON: {first_error}")
    return None
|
|
|
def test_bacteria_kingdom_divergence():
    """Print name, rank and ranked lineage for a fixed list of taxonomy IDs.

    For each ID the scientific name and rank are fetched through
    ``run_taxoniq_cli``; when both are available, up to the first ten
    entries of the ``ranked-lineage`` JSON are listed. The function only
    prints a report; it makes no assertions.
    """
    print("\n[Test 1] Bacteria Kingdom Divergence")
    print("-" * 60)

    # (taxonomy ID, display label)
    # NOTE(review): the labels are only echoed in the "Testing ..." line and
    # are never compared with what the CLI returns. Some may not match their
    # ID (e.g. 976 looks like a phylum-level ID rather than the genus
    # Pseudomonas) -- TODO confirm against NCBI.
    taxa = [
        (976, "Pseudomonas"),
        (1297, "Thermotoga"),
        (74201, "Helicobacter"),
        (562, "Escherichia coli"),
    ]

    for taxon_id, label in taxa:
        print(f"\nTesting {label} (TaxID: {taxon_id})")
        scientific_name = run_taxoniq_cli(taxon_id, 'scientific-name')
        taxon_rank = run_taxoniq_cli(taxon_id, 'rank')

        if not (scientific_name and taxon_rank):
            print(f" ✗ Could not fetch data for {taxon_id}")
            continue

        print(f" Scientific Name: {scientific_name}")
        print(f" Rank: {taxon_rank}")

        lineage = run_taxoniq_cli_json(taxon_id, 'ranked-lineage')
        if not (lineage and isinstance(lineage, list)):
            print(" Could not parse lineage")
            continue

        print(f" Ranked lineage ({len(lineage)} entries):")
        # Cap the listing at ten entries to keep the report short.
        for node in lineage[:10]:
            if not isinstance(node, dict):
                continue
            node_name = node.get('scientific_name', 'N/A')
            node_rank = node.get('rank', 'N/A')
            node_id = node.get('tax_id', 'N/A')
            print(f" - {node_name} ({node_rank}) [ID: {node_id}]")
|
|
|
def test_eukaryota_inclusion():
    """Report whether Eukaryota (2759) appears in each taxon's ranked lineage.

    For every test taxon the scientific name is fetched first; if that
    succeeds, the ``ranked-lineage`` JSON is fetched, its ``tax_id`` values
    are printed, the position of 2759 is reported, and the full lineage is
    listed. The function only prints a report; it makes no assertions.
    """
    print("\n[Test 2] Eukaryota Inclusion Divergence")
    print("-" * 60)

    # (taxonomy ID, display label)
    # NOTE(review): labels are display-only and are not checked against the
    # CLI output -- confirm each one matches its ID.
    taxa = [
        (4751, "Fungi"),
        (6239, "Caenorhabditis elegans"),
        (3239874, "Saccharomyces cerevisiae"),
    ]

    for taxon_id, label in taxa:
        print(f"\nTesting {label} (TaxID: {taxon_id})")
        scientific_name = run_taxoniq_cli(taxon_id, 'scientific-name')

        if not scientific_name:
            print(f" ✗ Could not fetch data for {taxon_id}")
            continue

        lineage = run_taxoniq_cli_json(taxon_id, 'ranked-lineage')
        if not (lineage and isinstance(lineage, list)):
            print(" Could not parse lineage JSON")
            continue

        nodes = [node for node in lineage if isinstance(node, dict)]
        ids_in_lineage = [node.get('tax_id') for node in nodes]

        print(f" Scientific Name: {scientific_name}")
        print(f" Lineage IDs: {ids_in_lineage}")

        # Look for Eukaryota (2759).
        # NOTE(review): this compares against the integer 2759, so it assumes
        # the JSON carries tax_id as a number -- TODO confirm.
        try:
            position = ids_in_lineage.index(2759)
        except ValueError:
            print(" ⚠️ Eukaryota (2759) NOT in lineage")
        else:
            print(f" ✓ Eukaryota (2759) found at position {position}")

        print(" Full lineage:")
        for node in nodes:
            print(f" - {node.get('scientific_name')} ({node.get('rank')}) [ID: {node.get('tax_id')}]")
|
|
|
def test_missing_taxa():
    """Check which taxa from a fixed list the taxoniq CLI can resolve.

    A taxon counts as found when ``run_taxoniq_cli`` returns a truthy
    scientific name; its rank is then fetched and printed as well. A
    found/missing summary is printed at the end. The function only prints a
    report; it makes no assertions.
    """
    print("\n[Test 3] Missing Taxa in Taxoniq")
    print("-" * 60)

    # Taxa expected to be absent from the older Taxoniq DB:
    # (taxonomy ID, display label)
    candidates = [
        (1182571, "Candidatus Monteginia"),
        (1505663, "Unknown species 1"),
        (1535326, "Unknown species 2"),
        (1909303, "Unknown species 3"),
        (215579, "Unknown species 4"),
        (270497, "Unknown species 5"),
        (3379134, "k__Pseudomonadati (new kingdom rank)"),
        (424536, "Unknown species 6"),
        (541000, "Unknown species 7"),
    ]

    n_found = 0
    for taxon_id, label in candidates:
        scientific_name = run_taxoniq_cli(taxon_id, 'scientific-name')
        if not scientific_name:
            print(f"\n✗ {label} (ID: {taxon_id}) - NOT FOUND")
            continue

        n_found += 1
        taxon_rank = run_taxoniq_cli(taxon_id, 'rank')
        print(f"\n✓ {label} (ID: {taxon_id})")
        print(f" Scientific name: {scientific_name}")
        print(f" Rank: {taxon_rank}")

    # Every candidate is either found or missing.
    n_missing = len(candidates) - n_found
    print(f"\n Summary: {n_found} found, {n_missing} missing")
|
|
|
def test_rank_name_changes():
    """Report which rank name the CLI gives for the top-level taxa.

    Fetches the rank of Bacteria, Archaea, Eukaryota and Viruses and prints
    a remark when it is one of ``superkingdom``, ``domain`` or
    ``acellular root``; any other rank is printed without a remark. The
    function only prints a report; it makes no assertions.
    """
    print("\n[Test 4] Rank Name Changes (March 2025 NCBI Update)")
    print("-" * 60)

    # Remark to print for each rank value of interest.
    remarks = {
        'superkingdom': " ⚠️ Uses old 'superkingdom' rank (pre-March 2025)",
        'domain': " ✓ Uses new 'domain' rank (post-March 2025)",
        'acellular root': " ✓ Viruses use 'acellular root' rank (post-March 2025)",
    }

    # Taxa affected by the superkingdom -> domain change:
    # (taxonomy ID, display label)
    top_level_taxa = [
        (2, "Bacteria"),
        (2157, "Archaea"),
        (2759, "Eukaryota"),
        (10239, "Viruses"),
    ]

    for taxon_id, label in top_level_taxa:
        print(f"\nTesting {label} (TaxID: {taxon_id})")
        taxon_rank = run_taxoniq_cli(taxon_id, 'rank')

        if not taxon_rank:
            print(f" ✗ Could not fetch data for {taxon_id}")
            continue

        print(f" Rank: {taxon_rank}")
        remark = remarks.get(taxon_rank)
        if remark is not None:
            print(remark)
|
|
|
def test_representative_genomes():
    """Report RefSeq representative genome accessions for a few taxa.

    Runs ``taxoniq refseq-representative-genome-accessions`` with JSON
    output for each test taxon, drops blank lines and lines starting with
    ``Taxoniq``, parses the remainder as JSON and prints how many entries
    were returned (plus the first three).

    A CLI that cannot be started or that times out is reported for that
    taxon and the loop moves on, so one failure does not abort the rest of
    the suite. The function only prints a report; it makes no assertions.
    """
    print("\n[Test 5] RefSeq Representative Genomes")
    print("-" * 60)

    # (taxonomy ID, display label)
    test_cases = [
        (9606, "Homo sapiens"),
        (562, "Escherichia coli"),
        (6239, "Caenorhabditis elegans"),
    ]

    for tax_id, name in test_cases:
        print(f"\nTesting {name} (TaxID: {tax_id})")

        try:
            result = subprocess.run(
                ['taxoniq', 'refseq-representative-genome-accessions',
                 '--taxon-id', str(tax_id), '--output-format', 'json'],
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (OSError, subprocess.SubprocessError, ValueError) as e:
            # Binary missing / not executable, timeout, or undecodable output.
            print(f" ✗ Could not fetch RefSeq data: {e}")
            continue

        if result.returncode != 0:
            print(" ✗ Could not fetch RefSeq data")
            continue

        # Drop blank lines and 'Taxoniq...' metadata lines before parsing.
        stripped = (raw.strip() for raw in result.stdout.strip().split('\n'))
        payload_lines = [
            text for text in stripped
            if text and not text.startswith('Taxoniq')
        ]

        try:
            data = json.loads('\n'.join(payload_lines))
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            print(f" Could not parse RefSeq output: {e}")
            continue

        if isinstance(data, list):
            print(f" Found {len(data)} RefSeq representatives")
            if data:
                print(f" First 3: {data[:3]}")
        else:
            print(" Unexpected output format")
|
|
|
def test_cli_help_and_version():
    """Smoke-test the CLI by running ``taxoniq --help`` and ``--version``.

    Each invocation uses the same 5-second timeout as the other CLI calls in
    this script. A CLI that cannot be started or that times out is reported
    as a failed check instead of raising. The function only prints a
    report; it makes no assertions.
    """
    print("\n[Test 0] CLI Help and Version")
    print("-" * 60)

    def _invoke(flag):
        """Run ``taxoniq <flag>``; return the completed process or None."""
        try:
            return subprocess.run(
                ['taxoniq', flag],
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (OSError, subprocess.SubprocessError, ValueError) as e:
            # Binary missing / not executable, timeout, or undecodable output.
            print(f" ERROR running 'taxoniq {flag}': {e}")
            return None

    # Test help
    help_result = _invoke('--help')
    if help_result is not None and help_result.returncode == 0:
        print("✓ 'taxoniq --help' works")
    else:
        print("✗ 'taxoniq --help' failed")

    # Test version
    version_result = _invoke('--version')
    if version_result is not None and version_result.returncode == 0:
        version = version_result.stdout.strip()
        print(f"✓ Taxoniq version: {version}")
    else:
        print("✗ Could not get version")
|
|
|
def main():
    """Run every check in order, framed by a banner.

    Exits with status 1 if the run is interrupted with Ctrl-C or if any
    check raises an unexpected exception (the traceback is printed first).
    """
    rule = "=" * 60
    print(rule)
    print("Taxoniq CLI Tests - BugSigDB Issue #248 Divergences")
    print(rule)

    try:
        # Order matters only for the report: the CLI smoke test goes first.
        checks = (
            test_cli_help_and_version,
            test_rank_name_changes,
            test_bacteria_kingdom_divergence,
            test_eukaryota_inclusion,
            test_missing_taxa,
            test_representative_genomes,
        )
        for check in checks:
            check()

        print("\n" + rule)
        print("Testing complete!")
        print(rule)

    except KeyboardInterrupt:
        print("\n\nTesting interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n\nUnexpected error: {e}")
        # Imported here because it is only needed on this failure path.
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
|
|
# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()