lwaldron · January 30, 2026 08:05
diff --git a/BUGSIGDB_TESTING.md b/BUGSIGDB_TESTING.md
diff --git a/REBUILD_GUIDE.md b/REBUILD_GUIDE.md
diff --git a/rebuild_taxoniq.sh b/rebuild_taxoniq.sh
 #!/bin/bash
 # Rebuild the Taxoniq indexes against the current NCBI taxonomy.
 #
 # Usage:   ./rebuild_taxoniq.sh [date-version]
 # Example: ./rebuild_taxoniq.sh 2026.01.29
 #
 # Downloads are kept in ~/.cache/taxoniq-rebuild/ so that later runs do not
 # have to fetch the large files again. Export TAXONIQ_CACHE_DIR to move the
 # cache somewhere else.

 set -e

 # Version to stamp into the packages: first argument, else the built-in default.
 VERSION="${1:-2026.01.29}"
 # Persistent cache root (deliberately not /tmp) and a per-process work directory.
 CACHE_DIR="${TAXONIQ_CACHE_DIR:-$HOME/.cache/taxoniq-rebuild}"
 WORKDIR="$CACHE_DIR/work-$$"
 # Absolute path of the directory holding this script.
 TAXONIQ_REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

 BANNER_RULE="=========================================="
 printf '%s\n' \
    "$BANNER_RULE" \
    "Taxoniq Taxonomy Rebuild Script" \
    "$BANNER_RULE" \
    "Version: $VERSION" \
    "Repository: $TAXONIQ_REPO" \
    "Cache directory: $CACHE_DIR" \
    "Work directory: $WORKDIR" \
    ""

 # Step 1: Setup directories
 echo "[1/7] Setting up directories..."
 export BLASTDB="$CACHE_DIR/blast_databases"
 mkdir -p "$CACHE_DIR" "$WORKDIR" "$BLASTDB"
 cd "$WORKDIR"

 # Step 2: Download NCBI taxonomy dump
 echo "[2/7] Downloading NCBI taxonomy dump..."
 TAXDUMP_FILE="$CACHE_DIR/new_taxdump.tar.gz"
 TAXDUMP_URL="https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz"

 if [[ ! -f "$TAXDUMP_FILE" ]]; then
    echo "  Fetching taxonomy dump from NCBI..."
    # -f makes curl fail on HTTP errors instead of saving the error page.
    # The download goes to a temporary name and is renamed only on success,
    # so an interrupted or failed transfer is never mistaken for a cached dump.
    if ! curl -fsSL -o "$TAXDUMP_FILE.part" "$TAXDUMP_URL"; then
        rm -f "$TAXDUMP_FILE.part"
        echo "  ✗ Failed to download $TAXDUMP_URL" >&2
        exit 1
    fi
    mv "$TAXDUMP_FILE.part" "$TAXDUMP_FILE"
    echo "  ✓ Downloaded taxonomy dump to cache"
 else
    echo "  ✓ Using cached taxonomy dump"
 fi

 if [[ ! -f nodes.dmp ]]; then
    echo "  Extracting taxonomy files..."
    # A tarball that cannot be extracted is evicted from the cache so the
    # next run downloads a fresh copy instead of failing the same way again.
    if ! tar -xzf "$TAXDUMP_FILE"; then
        rm -f "$TAXDUMP_FILE"
        echo "  ✗ Could not extract $TAXDUMP_FILE (removed from cache; re-run to download again)" >&2
        exit 1
    fi
    echo "  ✓ Taxonomy files ready"
 else
    echo "  ✓ Using cached extracted taxonomy files"
 fi

 # Step 3: Get latest BLAST database info
 echo "[3/7] Getting NCBI BLAST database metadata..."
 LATEST_DIR_FILE="$CACHE_DIR/latest-dir"
 if [[ ! -f "$LATEST_DIR_FILE" ]]; then
    echo "  Fetching BLAST database version info..."
    # Report success only when the fetch actually succeeded; on failure a
    # hard-coded fallback version is written to the cache file instead.
    if aws s3 cp --no-sign-request s3://ncbi-blast-databases/latest-dir "$LATEST_DIR_FILE" 2>/dev/null; then
        echo "  ✓ Downloaded BLAST version info to cache"
    else
        echo "  ⚠️  Could not fetch latest BLAST version, using fallback"
        echo "2026-01-29T00-43-36" > "$LATEST_DIR_FILE"
    fi
 else
    echo "  ✓ Using cached BLAST version info"
 fi
 BLAST_VERSION=$(cat "$LATEST_DIR_FILE")
 # An empty version would make the later S3 prefix and cache path meaningless.
 if [[ -z "$BLAST_VERSION" ]]; then
    echo "  ✗ $LATEST_DIR_FILE is empty; delete it and re-run" >&2
    exit 1
 fi
 echo "  ✓ BLAST database version: $BLAST_VERSION"

 # Step 4: Download representative BLAST databases
 echo "[4/7] Downloading representative BLAST databases..."
 BLAST_CACHE_DIR="$CACHE_DIR/blast_databases/$BLAST_VERSION"
 mkdir -p "$BLAST_CACHE_DIR"

 # Check if we already have the BLAST databases: one matching file in the
 # versioned cache directory is taken as evidence of an earlier download.
 BLAST_FILES_EXIST=false
 if [[ -d "$BLAST_CACHE_DIR" ]] && [[ -n "$(find "$BLAST_CACHE_DIR" -name "ref_*_rep_genomes*" -type f 2>/dev/null | head -1)" ]]; then
    BLAST_FILES_EXIST=true
    echo "  ✓ Using cached BLAST databases"
 else
    echo "  This may take 5-15 minutes (~20-30 GB)..."
    # Branch directly on the sync's exit status. Inspecting $? after a
    # "cmd || { ... }" list only sees the status of the fallback block, which
    # would report success even when the download failed.
    if aws s3 sync --no-sign-request \
        "s3://ncbi-blast-databases/$BLAST_VERSION/" "$BLAST_CACHE_DIR" \
        --exclude "*" \
        --include "ref_prok_rep_genomes*" \
        --include "ref_euk_rep_genomes*" \
        --include "ref_viruses_rep_genomes*" \
        2>/dev/null; then
        echo "  ✓ BLAST databases downloaded to cache"
        BLAST_FILES_EXIST=true
    else
        # Best effort: the rebuild continues without the databases.
        echo "  ⚠️  Warning: Could not download all BLAST databases"
        echo "     You can manually download them and set BLASTDB=$BLAST_CACHE_DIR"
        BLAST_FILES_EXIST=false
    fi
 fi

 # Set BLASTDB to the cached location
 export BLASTDB="$BLAST_CACHE_DIR"

 # Step 5: Copy data and rebuild indexes
 echo "[5/7] Rebuilding Taxoniq indexes..."
 export PYTHONPATH="$TAXONIQ_REPO:$PYTHONPATH"
 cd "$TAXONIQ_REPO"

 # nodes.dmp and names.dmp are mandatory; a missing file aborts via set -e.
 for required_dump in nodes.dmp names.dmp; do
    cp "$WORKDIR/$required_dump" .
 done
 # merged.dmp and delnodes.dmp are copied only when the archive provided them.
 for optional_dump in merged.dmp delnodes.dmp; do
    if [[ -f "$WORKDIR/$optional_dump" ]]; then
        cp "$WORKDIR/$optional_dump" .
    fi
 done

 echo "  Building indexes (this takes 10-30 minutes)..."
 if ! python3 -m taxoniq.build trees; then
    echo "  ✗ Build failed!"
    exit 1
 fi
 echo "  ✓ Indexes rebuilt"

 # Step 6: Update version numbers
 echo "[6/7] Updating version numbers to $VERSION..."
 # VERSION is an unexported shell variable, so it is handed to Python through
 # the command's environment; otherwise os.environ never sees the value given
 # on the command line. The heredoc delimiter is quoted so the shell leaves the
 # Python source (backslashes, quotes, braces) untouched.
 VERSION="$VERSION" python3 << 'EOF'
 import os
 import re

 version = os.environ["VERSION"]

 files_to_update = [
    "setup.py",
    "db_packages/ncbi_taxon_db/setup.py",
    "db_packages/ncbi_refseq_accession_db/setup.py",
    "db_packages/ncbi_refseq_accession_lengths/setup.py",
    "db_packages/ncbi_refseq_accession_offsets/setup.py",
 ]

 # Data packages whose "==" requirement pins are moved to the new version.
 pinned_packages = [
    "ncbi-refseq-accession-db",
    "ncbi-refseq-accession-lengths",
    "ncbi-refseq-accession-offsets",
 ]

 for filepath in files_to_update:
    if not os.path.exists(filepath):
        print(f"  ⚠️  {filepath} not found")
        continue

    with open(filepath, "r", encoding="utf-8") as f:
        content = f.read()

    # Update the version number. The replacement must end with exactly one
    # closing quote so the rewritten setup.py remains valid Python.
    updated = re.sub(
        r'version\s*=\s*["\'][\d.]+["\']',
        f'version="{version}"',
        content,
    )

    # Update the ncbi-taxon-db requirement, keeping whichever operator is used.
    for op in (">=", "=="):
        updated = re.sub(
            rf"ncbi-taxon-db\s*{op}\s*[\d.]+",
            f"ncbi-taxon-db {op} {version}",
            updated,
        )

    # Update the other package requirements.
    for pkg in pinned_packages:
        updated = re.sub(
            rf"{pkg}\s*==\s*[\d.]+",
            f"{pkg} == {version}",
            updated,
        )

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(updated)

    print(f"  ✓ Updated {filepath}")

 EOF

 # Step 7: Test and report
 echo "[7/7] Testing updated Taxoniq..."
 # "python3 -m pip" installs into the same interpreter that runs the smoke test
 # below. stderr is left visible so a failed install explains itself instead of
 # ending the script silently under "set -e".
 python3 -m pip install --force-reinstall --no-cache-dir -q -e . || {
    echo "  ✗ Failed to install taxoniq" >&2
    exit 1
 }
 python3 -m pip install --force-reinstall --no-cache-dir -q \
    db_packages/ncbi_taxon_db \
    db_packages/ncbi_refseq_accession_db \
    db_packages/ncbi_refseq_accession_lengths \
    db_packages/ncbi_refseq_accession_offsets || {
    echo "  ✗ Failed to install the database packages" >&2
    exit 1
 }

 # Smoke test: a non-zero exit from Python aborts the script via "set -e".
 python3 << 'EOF'
 import sys
 import traceback

 import taxoniq

 try:
    # Test basic functionality
    human = taxoniq.Taxon(9606)
    assert human.scientific_name == "Homo sapiens", "Human lookup failed"

    # Test rank names
    bacteria = taxoniq.Taxon(2)
    print(f"  ✓ Bacteria rank: {bacteria.rank.name}")

    # Test lineage
    e_coli = taxoniq.Taxon(562)
    print(f"  ✓ E. coli lineage has {len(e_coli.ranked_lineage)} entries")

    # Test RefSeq
    genomes = human.refseq_representative_genome_accessions
    print(f"  ✓ H. sapiens has {len(genomes)} RefSeq representative genomes")

    print("\n✅ All tests passed!")

 except Exception as e:
    print(f"❌ Test failed: {e}")
    traceback.print_exc()
    sys.exit(1)

 EOF

 # Final summary
 # NOTE(review): the list below mentions version.py files, but the version
 # update step in this script only rewrites setup.py files — confirm whether
 # the index build touches version.py.
 SUMMARY_RULE="=========================================="
 printf '%s\n' \
    "" \
    "$SUMMARY_RULE" \
    "✅ Rebuild Complete!" \
    "$SUMMARY_RULE" \
    "" \
    "Summary:" \
    "  Version: $VERSION" \
    "  Cache directory: $CACHE_DIR" \
    "  Work directory: $WORKDIR" \
    "  Updated files:" \
    "    - setup.py" \
    "    - db_packages/*/setup.py" \
    "    - db_packages/*/ncbi_*/version.py" \
    "" \
    "Next steps:" \
    "  1. Review changes: git diff" \
    "  2. Test thoroughly: python3 test_bugsigdb_divergences.py" \
    "  3. Commit: git add -A && git commit -m 'Update to NCBI taxonomy $VERSION'" \
    "  4. Tag: git tag -a v1.0.5 -m 'Taxoniq v1.0.5 with NCBI taxonomy $VERSION'" \
    "" \
    "Optional cleanup:" \
    "  rm -rf $WORKDIR" \
    "  rm -rf $CACHE_DIR  # Remove all cached files" \
    ""
diff --git a/run_bugsigdb_tests.sh b/run_bugsigdb_tests.sh
 #!/bin/bash
 # Quick reference for running BugSigDB divergence tests.
 #
 # Creates (if needed) and activates a local .venv, installs taxoniq in
 # editable mode, then runs the three test scripts in sequence.

 set -e

 # Work from the directory containing this script so that .venv and the test
 # files resolve no matter where the script is invoked from.
 cd "$(dirname "${BASH_SOURCE[0]}")"

 echo "======================================================================"
 echo "Taxoniq BugSigDB Divergence Tests - Quick Start"
 echo "======================================================================"
 echo ""

 # Check if venv exists
 if [ ! -d ".venv" ]; then
    echo "Creating virtual environment..."
    # Outside a venv there may be no bare "python" on PATH; prefer python3.
    PYTHON_BIN="$(command -v python3 || command -v python)"
    "$PYTHON_BIN" -m venv .venv
 fi

 # Activate venv
 echo "Activating virtual environment..."
 source .venv/bin/activate

 # Install dependencies if needed. Installation stays best-effort, but a
 # failure is reported so that later import errors are not a mystery.
 echo "Installing taxoniq..."
 if ! pip install -q -e .; then
    echo "⚠️  Warning: could not install taxoniq; tests will use whatever is already installed" >&2
 fi

 echo ""
 echo "======================================================================"
 echo "[Test 1/3] Running BugSigDB Divergences Test (Python API)"
 echo "======================================================================"
 python test_bugsigdb_divergences.py

 echo ""
 echo "======================================================================"
 echo "[Test 2/3] Running PR Test"
 echo "======================================================================"
 python test_taxoniq_pr.py

 echo ""
 echo "======================================================================"
 echo "[Test 3/3] Running CLI Divergences Test"
 echo "======================================================================"
 # A non-zero exit here is tolerated and only noted.
 python test_taxoniq_cli_divergences.py || echo "Note: Some CLI tests may have format differences"

 echo ""
 echo "======================================================================"
 echo "All tests complete!"
 echo "======================================================================"
 echo ""
 echo "For detailed results and analysis, see BUGSIGDB_TESTING.md"
diff --git a/test_bugsigdb_divergences.py b/test_bugsigdb_divergences.py
 #!/usr/bin/env python3
 """
 Test Taxoniq Python API against known divergences identified in BugSigDB issue #248.

 This test uses the Python API to directly test taxonomy lookups and compares them
 to known divergences between Taxoniq and the NCBI API.

 Reference: https://github.com/waldronlab/BugSigDB/issues/248
 """

 import taxoniq
 import sys

 def test_rank_name_changes():
    """Print the rank Taxoniq reports for four top-level taxa.

    Each taxon is looked up by ID and its rank name is printed, with a note
    when the rank is the old 'superkingdom' or the newer 'domain'. Lookup
    failures are printed rather than raised.
    """
    for banner_line in (
        "\n[Test 1] Rank Name Changes (March 2025 NCBI Update)",
        "-" * 70,
        "Note: Taxoniq uses older NCBI data (Sept 2024)",
        "Expected: superkingdom rank (not yet updated to 'domain')",
    ):
        print(banner_line)
    print()

    top_level_taxa = [
        (2, "Bacteria"),
        (2157, "Archaea"),
        (2759, "Eukaryota"),
        (10239, "Viruses"),
    ]

    for tax_id, name in top_level_taxa:
        # Shared prefix of every line printed for this taxon.
        label = f"{name:20} (ID: {tax_id:5})"
        try:
            rank = taxoniq.Taxon(tax_id).rank
            # Fall back to the plain string form when the rank has no .name.
            rank_name = str(rank) if not hasattr(rank, 'name') else rank.name
            print(f"{label} - Rank: {rank_name}")

            if rank_name == 'superkingdom':
                print("  ⚠️  Uses old 'superkingdom' rank (expected for Sept 2024 data)")
            elif rank_name == 'domain':
                print("  ✓ Uses new 'domain' rank (updated to March 2025 NCBI)")
        except taxoniq.NoValue:
            print(f"{label} - NOT FOUND")
        except Exception as err:
            print(f"{label} - ERROR: {err}")

 def test_bacteria_kingdom_divergence():
    """Print lineage details for bacterial taxa and check for Bacteria (ID 2).

    For each test taxon, prints the scientific name, rank and the first
    entries of its ranked lineage, then reports whether the lineage contains
    the Bacteria taxon. Lookup failures are printed rather than raised.
    """
    print("\n[Test 2] Bacteria Kingdom Divergence")
    print("-" * 70)
    print("Known divergence: Taxoniq may use generic k__Bacteria (2)")
    print("while NCBI may use newer kingdom ranks like Pseudomonadati (3379134)")
    print()

    # NOTE(review): the labels for the first three IDs may not match the taxa
    # those IDs denote in NCBI — verify; the printed scientific name is
    # authoritative.
    test_cases = [
        (976, "Pseudomonas", "Should have phylum in lineage"),
        (1297, "Thermotoga", "Should have phylum in lineage"),
        (74201, "Helicobacter", "Should have phylum in lineage"),
        (562, "Escherichia coli", "Species with complete lineage"),
    ]

    bacteria_tax_id = 2  # the generic Bacteria taxon named in the banner
    max_shown = 8        # lineage entries printed before truncating

    for tax_id, name, description in test_cases:
        try:
            t = taxoniq.Taxon(tax_id)
            print(f"\n{name} (ID: {tax_id})")
            print(f"  {description}")
            print(f"  Scientific name: {t.scientific_name}")
            print(f"  Rank: {t.rank.name if hasattr(t.rank, 'name') else t.rank}")

            # Get lineage
            lineage = t.ranked_lineage
            print(f"  Lineage ({len(lineage)} entries):")
            for i, entry in enumerate(lineage):
                if i >= max_shown:
                    # Only reached when entries really remain, so the count
                    # is never reported as zero.
                    print(f"    ... ({len(lineage) - max_shown} more entries)")
                    break
                rank_name = entry.rank.name if hasattr(entry.rank, 'name') else str(entry.rank)
                print(f"    {i+1}. {entry.scientific_name:30} [{rank_name:15}] (ID: {entry.tax_id})")

            # Check for Bacteria by taxon ID. A substring match on the name
            # would also hit unrelated entries such as "Enterobacteriaceae".
            bacteria_entry = next(
                (entry for entry in lineage if entry.tax_id == bacteria_tax_id),
                None,
            )
            if bacteria_entry is not None:
                print(f"  ✓ Bacteria entry found: {bacteria_entry.scientific_name} (ID: {bacteria_entry.tax_id})")
            elif tax_id != 562:  # E. coli might be different
                print("  ⚠️  No 'Bacteria' entry in lineage")

        except taxoniq.NoValue:
            print(f"{name} (ID: {tax_id}) - NOT FOUND in Taxoniq")
        except Exception as e:
            print(f"{name} (ID: {tax_id}) - ERROR: {e}")

 def test_eukaryota_inclusion():
    """Print whether Eukaryota (2759) appears in each test taxon's lineage.

    For every test taxon the ranked lineage IDs are printed, followed by the
    position of 2759 (or a warning when absent) and the full lineage with
    ranks. Lookup failures are printed rather than raised.
    """
    print("\n[Test 3] Eukaryota Inclusion Divergence")
    print("-" * 70)
    print("Known divergence: Taxoniq includes Eukaryota (2759) in lineages")
    print("while NCBI may start at lower ranks like Fungi (4751)")
    print()

    eukaryote_cases = [
        (4751, "Fungi", "Kingdom-level organism"),
        (6239, "Caenorhabditis elegans", "Nematode"),
        (9606, "Homo sapiens", "Mammal/Human"),
    ]

    for tax_id, name, description in eukaryote_cases:
        try:
            taxon = taxoniq.Taxon(tax_id)
            print(f"\n{name} (ID: {tax_id})")
            print(f"  {description}")
            print(f"  Scientific name: {taxon.scientific_name}")

            lineage = taxon.ranked_lineage
            lineage_ids = []
            for member in lineage:
                lineage_ids.append(member.tax_id)

            print(f"  Lineage IDs: {lineage_ids}")

            # Check for Eukaryota (2759)
            if 2759 not in lineage_ids:
                print("  ⚠️  Eukaryota (2759) NOT in lineage (NCBI divergence)")
            else:
                position = lineage_ids.index(2759)
                print(f"  ✓ Eukaryota (2759) found at position {position}")

            # Print lineage with ranks, numbered from 1
            print(f"  Full lineage ({len(lineage)} entries):")
            for number, member in enumerate(lineage, start=1):
                if hasattr(member.rank, 'name'):
                    rank_name = member.rank.name
                else:
                    rank_name = str(member.rank)
                print(f"    {number}. {member.scientific_name:25} [{rank_name:15}] ID: {member.tax_id}")

        except taxoniq.NoValue:
            print(f"{name} (ID: {tax_id}) - NOT FOUND in Taxoniq")
        except Exception as err:
            print(f"{name} (ID: {tax_id}) - ERROR: {err}")

 def test_missing_taxa():
    """Look up taxa listed as missing and print a found/missing tally.

    A lookup raising taxoniq.NoValue counts as missing and a successful one
    as found. Any other exception is printed and counted as neither.
    """
    print("\n[Test 4] Missing Taxa in Taxoniq")
    print("-" * 70)
    print("These taxa are known to be missing from the Sept 2024 Taxoniq DB")
    print()

    candidates = [
        (1182571, "Candidatus Monteginia"),
        (1505663, "Unknown species 1"),
        (1535326, "Unknown species 2"),
        (1909303, "Unknown species 3"),
        (3379134, "k__Pseudomonadati (new kingdom rank, March 2025)"),
        (424536, "Unknown species 6"),
        (541000, "Unknown species 7"),
    ]

    tally = {"found": 0, "missing": 0}

    for tax_id, name in candidates:
        label = f"{name:40} (ID: {tax_id})"
        try:
            taxon = taxoniq.Taxon(tax_id)
            tally["found"] += 1
            rank = taxon.rank
            rank_name = str(rank) if not hasattr(rank, 'name') else rank.name
            print(f"✓ {label}")
            print(f"  Scientific name: {taxon.scientific_name}, Rank: {rank_name}")
        except taxoniq.NoValue:
            tally["missing"] += 1
            print(f"✗ {label} - NOT FOUND")
        except Exception as err:
            print(f"✗ {label} - ERROR: {err}")

    found_count, missing_count = tally["found"], tally["missing"]
    print(f"\n  Summary: {found_count} found, {missing_count} missing")
    if found_count > 0:
        print("  ⚠️  Database may have been updated since issue was filed")

 def test_refseq_availability():
    """Print how many RefSeq representative genomes each test taxon has.

    The accession lookup has its own error handling: taxoniq.NoValue is
    reported as "none indexed" and any other error as a warning. A failure
    creating the taxon or printing its heading is reported by the outer
    handler.
    """
    print("\n[Test 5] RefSeq Representative Genome Availability")
    print("-" * 70)

    organisms = [
        (9606, "Homo sapiens", "Human"),
        (562, "Escherichia coli", "E. coli"),
        (6239, "Caenorhabditis elegans", "Worm"),
    ]

    for tax_id, name, common in organisms:
        heading = f"{name} ({common})"
        try:
            taxon = taxoniq.Taxon(tax_id)
            print(f"\n{heading}")
            print(f"  Taxon ID: {tax_id}")

            # Get representative genomes
            try:
                accessions = taxon.refseq_representative_genome_accessions
                print(f"  ✓ RefSeq representative genomes: {len(accessions)} available")
                if accessions:
                    preview = accessions[:3]
                    print(f"    First 3: {preview}")
            except taxoniq.NoValue:
                print("  ℹ️  No RefSeq representative genomes indexed")
            except Exception as err:
                print(f"  ⚠️  Error fetching RefSeq genomes: {err}")

        except Exception as err:
            print(f"{heading} - ERROR: {err}")

 def test_performance():
    """Time a handful of Taxoniq lookups and print the totals.

    For each test ID the scientific name, rank and ranked lineage are read.
    A lookup that raises is reported and left out of the success count. The
    average is computed over successful lookups only.
    """
    print("\n[Test 6] Performance Test")
    print("-" * 70)

    import time

    test_ids = [9606, 562, 4751, 2, 2759, 10239, 131567]  # Various taxa

    print(f"Testing {len(test_ids)} taxa lookups...")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations.
    start = time.perf_counter()
    successful = 0

    for tax_id in test_ids:
        try:
            t = taxoniq.Taxon(tax_id)
            _ = t.scientific_name  # Force actual lookup
            _ = t.rank
            _ = t.ranked_lineage
        except Exception as e:
            # Catch Exception only, so Ctrl-C still interrupts the run, and
            # report the failure instead of hiding it.
            print(f"  ✗ Lookup failed for {tax_id}: {e}")
            continue
        successful += 1

    elapsed = time.perf_counter() - start
    avg_time = (elapsed * 1000) / successful if successful > 0 else 0

    print(f"✓ Completed {successful}/{len(test_ids)} lookups")
    print(f"  Total time: {elapsed*1000:.1f} ms")
    print(f"  Average per lookup: {avg_time:.1f} ms")
    print("  Note: NCBI API would take seconds per lookup")

 def main():
    """Run every divergence test in order and print a closing banner.

    Exits with status 1 when interrupted from the keyboard or when a test
    function raises an unexpected exception (the traceback is printed).
    """
    print("=" * 70)
    print("Taxoniq Tests - BugSigDB Issue #248 Divergences")
    # The banner must not abort the run when the package exposes no
    # __version__ attribute.
    version = getattr(taxoniq, "__version__", "unknown")
    print(f"Using Taxoniq {version}")
    print("=" * 70)

    try:
        test_rank_name_changes()
        test_bacteria_kingdom_divergence()
        test_eukaryota_inclusion()
        test_missing_taxa()
        test_refseq_availability()
        test_performance()

        print("\n" + "=" * 70)
        print("Testing complete!")
        print("=" * 70)

    except KeyboardInterrupt:
        print("\n\nTesting interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n\nUnexpected error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

 if __name__ == "__main__":
    main()
diff --git a/TEST_results.md b/TEST_results.md
diff --git a/test_taxoniq_cli_divergences.py b/test_taxoniq_cli_divergences.py
 #!/usr/bin/env python3
 """
 Test Taxoniq CLI against known divergences identified in BugSigDB issue #248.

 This script tests specific taxonomy IDs and scenarios that are known to have
 differences between Taxoniq and the NCBI API.

 Key known divergences:
 1. Taxoniq uses k__Bacteria (2) while NCBI may use newer kingdom ranks like
   k__Pseudomonadati (3379134) or k__Thermotogati (3384194)
 2. Taxoniq includes Eukaryota (2759) in lineages while NCBI may start at lower ranks
 3. Rank name changes: superkingdom -> domain (as of March 2025)
 4. Missing taxa in older Taxoniq DB
 """

 import json
 import subprocess
 import sys

 def run_taxoniq_cli(tax_id, command='scientific-name'):
    """Invoke ``taxoniq <command> --taxon-id <tax_id>`` and return its output.

    Returns the stripped stdout with surrounding double quotes removed. When
    the output looks like an Enum repr (contains '<' and '>' and a dot), only
    the member name is returned. Returns None on a non-zero exit status or
    when running the command raises; the error is printed in the latter case.
    """
    try:
        argv = ['taxoniq', command, '--taxon-id', str(tax_id)]
        proc = subprocess.run(argv, capture_output=True, text=True, timeout=5)
        if proc.returncode != 0:
            return None

        text = proc.stdout.strip().strip('"')
        # Enum repr such as "<Rank.superkingdom: 27>" -> "superkingdom"
        has_angle_brackets = '<' in text and '>' in text
        pieces = text.split('.')
        if has_angle_brackets and len(pieces) > 1:
            member = pieces[-1].split(':')[0]
            return member.rstrip('>')
        return text
    except Exception as err:
        print(f"  ERROR running CLI: {err}")
        return None

 def run_taxoniq_cli_json(tax_id, command='lineage'):
    """Invoke the taxoniq CLI with JSON output and return the parsed value.

    Lines before the first one starting with '[' or '{' are discarded; the
    rest is parsed as JSON. Returns None on a non-zero exit status, when no
    such line exists, or when running or parsing raises (the error is
    printed in that case).
    """
    try:
        proc = subprocess.run(
            ['taxoniq', command, '--taxon-id', str(tax_id), '--output-format', 'json'],
            capture_output=True,
            text=True,
            timeout=5
        )
        if proc.returncode != 0:
            return None

        stripped_lines = [raw.strip() for raw in proc.stdout.strip().split('\n')]
        # Index of the first line that opens a JSON array or object.
        first_json = next(
            (idx for idx, text in enumerate(stripped_lines)
             if text.startswith('[') or text.startswith('{')),
            None,
        )
        if first_json is None:
            return None
        return json.loads('\n'.join(stripped_lines[first_json:]))
    except Exception as err:
        print(f"  ERROR running CLI JSON: {err}")
        return None

 def test_bacteria_kingdom_divergence():
    """Print name, rank and ranked lineage for bacterial taxa via the CLI.

    At most the first ten lineage entries are printed for each taxon. A
    taxon whose name or rank cannot be fetched is reported and skipped.
    """
    print("\n[Test 1] Bacteria Kingdom Divergence")
    print("-" * 60)

    # NOTE(review): the labels for the first three IDs may not match the taxa
    # those IDs denote in NCBI — verify.
    bacterial_cases = [
        (976, "Pseudomonas"),
        (1297, "Thermotoga"),
        (74201, "Helicobacter"),
        (562, "Escherichia coli"),
    ]

    for tax_id, name in bacterial_cases:
        print(f"\nTesting {name} (TaxID: {tax_id})")
        sci_name = run_taxoniq_cli(tax_id, 'scientific-name')
        rank = run_taxoniq_cli(tax_id, 'rank')

        if not (sci_name and rank):
            print(f"  ✗ Could not fetch data for {tax_id}")
            continue

        print(f"  Scientific Name: {sci_name}")
        print(f"  Rank: {rank}")

        # Get lineage
        lineage = run_taxoniq_cli_json(tax_id, 'ranked-lineage')
        if not (lineage and isinstance(lineage, list)):
            print("  Could not parse lineage")
            continue

        print(f"  Ranked lineage ({len(lineage)} entries):")
        for item in lineage[:10]:  # Limit to first 10
            if not isinstance(item, dict):
                continue
            print(f"    - {item.get('scientific_name', 'N/A')} ({item.get('rank', 'N/A')}) [ID: {item.get('tax_id', 'N/A')}]")

 def test_eukaryota_inclusion():
    """Print, via the CLI, whether Eukaryota (2759) is in each taxon's lineage.

    For each taxon the ranked lineage is fetched as JSON, the IDs are listed,
    the position of 2759 is reported (or a warning when absent), and the full
    lineage is printed.
    """
    print("\n[Test 2] Eukaryota Inclusion Divergence")
    print("-" * 60)

    # NOTE(review): 3239874 may not be the NCBI ID for Saccharomyces
    # cerevisiae — verify.
    eukaryote_cases = [
        (4751, "Fungi"),
        (6239, "Caenorhabditis elegans"),
        (3239874, "Saccharomyces cerevisiae"),
    ]

    for tax_id, name in eukaryote_cases:
        print(f"\nTesting {name} (TaxID: {tax_id})")
        sci_name = run_taxoniq_cli(tax_id, 'scientific-name')

        if not sci_name:
            print(f"  ✗ Could not fetch data for {tax_id}")
            continue

        lineage = run_taxoniq_cli_json(tax_id, 'ranked-lineage')
        if not (lineage and isinstance(lineage, list)):
            print("  Could not parse lineage JSON")
            continue

        dict_entries = [item for item in lineage if isinstance(item, dict)]
        lineage_ids = [item.get('tax_id') for item in dict_entries]

        print(f"  Scientific Name: {sci_name}")
        print(f"  Lineage IDs: {lineage_ids}")

        # Check for Eukaryota (2759)
        if 2759 not in lineage_ids:
            print("  ⚠️  Eukaryota (2759) NOT in lineage")
        else:
            position = lineage_ids.index(2759)
            print(f"  ✓ Eukaryota (2759) found at position {position}")

        # Print full lineage
        print("  Full lineage:")
        for item in dict_entries:
            print(f"    - {item.get('scientific_name')} ({item.get('rank')}) [ID: {item.get('tax_id')}]")

 def test_missing_taxa():
    """Query the CLI for taxa listed as missing and print a found/missing tally.

    A taxon counts as found when the scientific-name command returns a
    non-empty result; its rank is then fetched and printed as well.
    """
    print("\n[Test 3] Missing Taxa in Taxoniq")
    print("-" * 60)

    # These taxa are known to be missing from the older Taxoniq DB
    candidates = [
        (1182571, "Candidatus Monteginia"),
        (1505663, "Unknown species 1"),
        (1535326, "Unknown species 2"),
        (1909303, "Unknown species 3"),
        (215579, "Unknown species 4"),
        (270497, "Unknown species 5"),
        (3379134, "k__Pseudomonadati (new kingdom rank)"),
        (424536, "Unknown species 6"),
        (541000, "Unknown species 7"),
    ]

    found_count = 0
    missing_count = 0

    for tax_id, name in candidates:
        sci_name = run_taxoniq_cli(tax_id, 'scientific-name')
        if not sci_name:
            missing_count += 1
            print(f"\n✗ {name} (ID: {tax_id}) - NOT FOUND")
            continue

        found_count += 1
        rank = run_taxoniq_cli(tax_id, 'rank')
        print(f"\n✓ {name} (ID: {tax_id})")
        print(f"  Scientific name: {sci_name}")
        print(f"  Rank: {rank}")

    print(f"\n  Summary: {found_count} found, {missing_count} missing")

 def test_rank_name_changes():
    """Print the rank the CLI reports for four top-level taxa.

    A note is printed when the rank is 'superkingdom', 'domain' or
    'acellular root'. Any other rank is printed without comment.
    """
    print("\n[Test 4] Rank Name Changes (March 2025 NCBI Update)")
    print("-" * 60)

    # Remark printed for each rank name of interest.
    rank_remarks = {
        'superkingdom': "  ⚠️  Uses old 'superkingdom' rank (pre-March 2025)",
        'domain': "  ✓ Uses new 'domain' rank (post-March 2025)",
        'acellular root': "  ✓ Viruses use 'acellular root' rank (post-March 2025)",
    }

    # These organisms are affected by superkingdom -> domain change
    top_level_taxa = [
        (2, "Bacteria"),
        (2157, "Archaea"),
        (2759, "Eukaryota"),
        (10239, "Viruses"),
    ]

    for tax_id, name in top_level_taxa:
        print(f"\nTesting {name} (TaxID: {tax_id})")
        rank = run_taxoniq_cli(tax_id, 'rank')

        if not rank:
            print(f"  ✗ Could not fetch data for {tax_id}")
            continue

        print(f"  Rank: {rank}")
        remark = rank_remarks.get(rank)
        if remark is not None:
            print(remark)

 def test_representative_genomes():
    """Print the RefSeq representative genome accessions the CLI reports.

    For each test taxon the CLI is run with JSON output. Lines that are empty
    or start with 'Taxoniq' are dropped before parsing. A CLI that cannot be
    started or that times out is reported for that taxon, and the remaining
    taxa are still tested.
    """
    print("\n[Test 5] RefSeq Representative Genomes")
    print("-" * 60)

    test_cases = [
        (9606, "Homo sapiens"),
        (562, "Escherichia coli"),
        (6239, "Caenorhabditis elegans"),
    ]

    for tax_id, name in test_cases:
        print(f"\nTesting {name} (TaxID: {tax_id})")
        try:
            result = subprocess.run(
                ['taxoniq', 'refseq-representative-genome-accessions', '--taxon-id', str(tax_id), '--output-format', 'json'],
                capture_output=True,
                text=True,
                timeout=5
            )
        except (OSError, subprocess.TimeoutExpired) as e:
            # Missing executable or a hung process: report it and move on,
            # instead of letting it abort every remaining test.
            print(f"  ERROR running CLI: {e}")
            continue

        if result.returncode != 0:
            print("  ✗ Could not fetch RefSeq data")
            continue

        try:
            # Parse the output which is a JSON array, after removing any
            # metadata lines.
            lines = [
                line.strip()
                for line in result.stdout.strip().split('\n')
                if line.strip() and not line.strip().startswith('Taxoniq')
            ]
            data = json.loads('\n'.join(lines))
        except ValueError as e:
            print(f"  Could not parse RefSeq output: {e}")
            continue

        if isinstance(data, list):
            print(f"  Found {len(data)} RefSeq representatives")
            if data:
                print(f"  First 3: {data[:3]}")
        else:
            print("  Unexpected output format")

 def test_cli_help_and_version():
    """Check that ``taxoniq --help`` and ``taxoniq --version`` run successfully.

    Each command's outcome is printed. A CLI that cannot be started or that
    times out is reported as a failure instead of raising.
    """
    print("\n[Test 0] CLI Help and Version")
    print("-" * 60)

    def _run(flag):
        # Run "taxoniq <flag>"; return the completed process, or None when
        # the executable is missing or the call times out.
        try:
            return subprocess.run(
                ['taxoniq', flag],
                capture_output=True,
                text=True,
                timeout=30
            )
        except (OSError, subprocess.TimeoutExpired) as e:
            print(f"  ERROR running CLI: {e}")
            return None

    # Test help
    result = _run('--help')
    if result is not None and result.returncode == 0:
        print("✓ 'taxoniq --help' works")
    else:
        print("✗ 'taxoniq --help' failed")

    # Test version
    result = _run('--version')
    if result is not None and result.returncode == 0:
        version = result.stdout.strip()
        print(f"✓ Taxoniq version: {version}")
    else:
        print("✗ Could not get version")

 def main():
    """Run all CLI divergence tests in a fixed order and print a closing banner.

    Exits with status 1 when interrupted from the keyboard or when a test
    function raises an unexpected exception (the traceback is printed).
    """
    rule = "=" * 60
    print(rule)
    print("Taxoniq CLI Tests - BugSigDB Issue #248 Divergences")
    print(rule)

    try:
        test_sequence = (
            test_cli_help_and_version,
            test_rank_name_changes,
            test_bacteria_kingdom_divergence,
            test_eukaryota_inclusion,
            test_missing_taxa,
            test_representative_genomes,
        )
        for run_test in test_sequence:
            run_test()

        print("\n" + rule)
        print("Testing complete!")
        print(rule)

    except KeyboardInterrupt:
        print("\n\nTesting interrupted by user")
        sys.exit(1)
    except Exception as err:
        print(f"\n\nUnexpected error: {err}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

 if __name__ == "__main__":
    main()
diff --git a/test_taxoniq_pr.py b/test_taxoniq_pr.py
 #!/usr/bin/env python3
 """
 Basic Taxoniq PR verification test.

 Tests core functionality and database freshness to ensure a PR
 doesn't break basic Taxoniq operations.
 """

 import taxoniq
 import os
 import datetime
 import sys

 def test_taxoniq_pr():
    """Smoke-test a Taxoniq install and report how recent its data files look.

    Three stages run in order and each prints its outcome:

    1. Basic lookups for ``Taxon(9606)``; a wrong name or rank, or any
       exception, exits the process with status 1.
    2. A best-effort check that the ``taxoniq`` CLI can be executed; a
       missing CLI only produces a warning.
    3. A scan of the ``taxoniq`` package directory (and ``ncbi_taxon_db``
       when it is importable) for data files; the newest modification time
       is printed as a rough age estimate.
    """
    # Local import: the module header does not import subprocess.
    import subprocess

    print("--- Testing Taxoniq PR ---")
    print(f"Python executable: {sys.executable}")

    # 1. Check Package Version
    version = getattr(taxoniq, "__version__", None)
    if version is None:
        print("Taxoniq package version: Unknown (no __version__ attribute)")
    else:
        print(f"Taxoniq package version: {version}")

    # 2. Test Basic Functionality
    print("\n[1/3] Testing Basic Lookups...")
    try:
        # Test a well-known taxon (Human)
        human = taxoniq.Taxon(9606)
        print(f"  Query Taxon(9606): {human.scientific_name}")

        if human.scientific_name != "Homo sapiens":
            print("  FAILED: Taxon(9606) should be 'Homo sapiens'")
            sys.exit(1)

        if human.rank.name != "species":
            print(f"  FAILED: Taxon(9606) rank should be 'species', got '{human.rank.name}'")
            sys.exit(1)

        # The parent is only printed, not asserted.
        parent = human.parent
        print(f"  Parent of Human: {parent.scientific_name} (ID: {parent.tax_id})")

        print("  PASSED: Basic lookups working.")
    except Exception as e:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the explicit exits above are not intercepted here.
        print(f"  FAILED: Exception during basic lookup: {e}")
        sys.exit(1)

    # 3. Test CLI availability (optional check if installed)
    print("\n[2/3] Checking CLI...")
    try:
        # Argument list instead of a shell string: no dependence on
        # /dev/null or a POSIX shell being available.
        cli_ok = subprocess.run(
            ["taxoniq", "--help"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=30,
        ).returncode == 0
    except (OSError, subprocess.SubprocessError):
        cli_ok = False

    if cli_ok:
        print("  PASSED: 'taxoniq' CLI is available.")
    else:
        print("  WARNING: 'taxoniq' CLI not found in path (this might be expected if not installed globally).")

    # 4. Inspect Database Freshness
    print("\n[3/3] Inspecting Database Version/Freshness...")
    # NOTE(review): modification times may reflect install time rather than
    # the date of the underlying NCBI data - treat the result as a hint.
    package_dir = os.path.dirname(taxoniq.__file__)
    print(f"  Package directory: {package_dir}")
    search_dirs = [package_dir]

    # The data package is optional for this report; its absence should not
    # abort a test whose basic lookups already passed.
    try:
        import ncbi_taxon_db
    except ImportError:
        print("  WARNING: ncbi_taxon_db is not importable; only the taxoniq package is scanned.")
    else:
        ncbi_taxon_db_dir = os.path.dirname(ncbi_taxon_db.__file__)
        print(f"  NCBI Taxon DB directory: {ncbi_taxon_db_dir}")
        search_dirs.append(ncbi_taxon_db_dir)

    # Collect (mtime, path) pairs, reading each mtime exactly once.
    data_files = []
    for search_dir in search_dirs:
        for root, _dirs, files in os.walk(search_dir):
            for file in files:
                if file.endswith((".marisa", ".db", ".npy")) or "index" in file:
                    full_path = os.path.join(root, file)
                    try:
                        data_files.append((os.path.getmtime(full_path), full_path))
                    except OSError:
                        # File vanished or is unreadable; skip it.
                        continue

    if not data_files:
        print("  WARNING: No obvious data files found to check timestamps.")
        return

    # Newest first
    data_files.sort(key=lambda item: item[0], reverse=True)

    # commonpath raises ValueError when the directories share no common
    # root (e.g. different drives); fall back to absolute paths then.
    try:
        base_dir = os.path.commonpath(search_dirs)
    except ValueError:
        base_dir = None

    print("  Most recent data files found:")
    for mtime, filepath in data_files[:5]:
        dt = datetime.datetime.fromtimestamp(mtime)
        rel_path = os.path.relpath(filepath, base_dir) if base_dir else filepath
        print(f"    - {rel_path}: {dt.strftime('%Y-%m-%d %H:%M:%S')}")

    newest_file_date = datetime.datetime.fromtimestamp(data_files[0][0])
    age = datetime.datetime.now() - newest_file_date

    print(f"\n  Database Age Estimate: ~{age.days} days old")
    if age.days < 30:
        print("  RESULT: The taxonomy data appears to be RECENT.")
    else:
        print("  RESULT: The taxonomy data might be OLDER (check if this is expected).")

 if __name__ == "__main__":
    test_taxoniq_pr()
diff --git a/UPDATE_TAXONOMY.md b/UPDATE_TAXONOMY.md
Component	Before	After
Taxoniq version	1.0.4	1.0.5 (optional)
NCBI taxonomy DB	2024.9.07	2026.01.29
NCBI RefSeq DB	2024.9.07	2026.01.29
Rank names	superkingdom	domain
Data freshness	Sept 2024	Jan 2026
	#!/bin/bash
	# Quick start script to rebuild Taxoniq with current NCBI taxonomy
	# Usage: ./rebuild_taxoniq.sh [date-version]
	# Example: ./rebuild_taxoniq.sh 2026.01.29
	#
	# Caching: Downloads are cached in ~/.cache/taxoniq-rebuild/ to avoid
	# re-downloading large files on subsequent runs. Set TAXONIQ_CACHE_DIR
	# to override the cache location.

	set -e

	VERSION="${1:-2026.01.29}"
	# Use persistent cache directory instead of /tmp
	CACHE_DIR="${TAXONIQ_CACHE_DIR:-$HOME/.cache/taxoniq-rebuild}"
	WORKDIR="$CACHE_DIR/work-$$"
	TAXONIQ_REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

	echo "=========================================="
	echo "Taxoniq Taxonomy Rebuild Script"
	echo "=========================================="
	echo "Version: $VERSION"
	echo "Repository: $TAXONIQ_REPO"
	echo "Cache directory: $CACHE_DIR"
	echo "Work directory: $WORKDIR"
	echo

	# Step 1: Setup directories
	echo "[1/7] Setting up directories..."
	mkdir -p "$CACHE_DIR"
	mkdir -p "$WORKDIR"
	export BLASTDB="$CACHE_DIR/blast_databases"
	mkdir -p "$BLASTDB"
	cd "$WORKDIR"

	# Step 2: Download NCBI taxonomy dump
	echo "[2/7] Downloading NCBI taxonomy dump..."
	TAXDUMP_FILE="$CACHE_DIR/new_taxdump.tar.gz"
	TAXDUMP_URL="https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz"

	if [[ ! -f "$TAXDUMP_FILE" ]]; then
	    echo " Fetching taxonomy dump from NCBI..."
	    # -f makes curl fail on HTTP errors instead of saving the error page,
	    # and the temporary ".part" name keeps an interrupted download from
	    # being mistaken for a valid cache entry on the next run.
	    if ! curl -fsSL -o "$TAXDUMP_FILE.part" "$TAXDUMP_URL"; then
	        rm -f "$TAXDUMP_FILE.part"
	        echo " ✗ Could not download $TAXDUMP_URL" >&2
	        exit 1
	    fi
	    mv "$TAXDUMP_FILE.part" "$TAXDUMP_FILE"
	    echo " ✓ Downloaded taxonomy dump to cache"
	else
	    echo " ✓ Using cached taxonomy dump"
	fi

	# Extraction targets the current directory (the script has already
	# changed into the work directory at this point).
	if [[ ! -f nodes.dmp ]]; then
	    echo " Extracting taxonomy files..."
	    tar -xzf "$TAXDUMP_FILE"
	    echo " ✓ Taxonomy files ready"
	else
	    echo " ✓ Using cached extracted taxonomy files"
	fi

	# Step 3: Get latest BLAST database info
	echo "[3/7] Getting NCBI BLAST database metadata..."
	LATEST_DIR_FILE="$CACHE_DIR/latest-dir"
	if [[ ! -f "$LATEST_DIR_FILE" ]]; then
	    echo " Fetching BLAST database version info..."
	    # Branch on the download result so the "Downloaded" message is only
	    # printed when the file really came from S3.
	    if aws s3 cp --no-sign-request s3://ncbi-blast-databases/latest-dir "$LATEST_DIR_FILE" 2>/dev/null; then
	        echo " ✓ Downloaded BLAST version info to cache"
	    else
	        echo " ⚠️ Could not fetch latest BLAST version, using fallback"
	        # The fallback value is written to the cache file, so later runs
	        # reuse it until the file is removed.
	        echo "2026-01-29T00-43-36" > "$LATEST_DIR_FILE"
	    fi
	else
	    echo " ✓ Using cached BLAST version info"
	fi
	BLAST_VERSION=$(cat "$LATEST_DIR_FILE")
	echo " ✓ BLAST database version: $BLAST_VERSION"

	# Step 4: Download representative BLAST databases
	echo "[4/7] Downloading representative BLAST databases..."
	BLAST_CACHE_DIR="$CACHE_DIR/blast_databases/$BLAST_VERSION"
	mkdir -p "$BLAST_CACHE_DIR"

	# Check if we already have the BLAST databases. The glob mirrors the
	# --include patterns used for the sync below.
	BLAST_FILES_EXIST=false
	if [[ -n "$(find "$BLAST_CACHE_DIR" -name "ref_*_rep_genomes*" -type f 2>/dev/null | head -n 1)" ]]; then
	    BLAST_FILES_EXIST=true
	    echo " ✓ Using cached BLAST databases"
	else
	    echo " This may take 5-15 minutes (~20-30 GB)..."
	    # Test the sync's own exit status. Checking $? after a
	    # "cmd || { ...; }" group always sees 0, because the group's last
	    # command (an assignment) succeeds even when the sync failed.
	    if aws s3 sync --no-sign-request \
	        "s3://ncbi-blast-databases/$BLAST_VERSION/" "$BLAST_CACHE_DIR" \
	        --exclude "*" \
	        --include "ref_prok_rep_genomes*" \
	        --include "ref_euk_rep_genomes*" \
	        --include "ref_viruses_rep_genomes*" \
	        2>/dev/null; then
	        echo " ✓ BLAST databases downloaded to cache"
	        BLAST_FILES_EXIST=true
	    else
	        echo " ⚠️ Warning: Could not download all BLAST databases"
	        echo " You can manually download them and set BLASTDB=$BLAST_CACHE_DIR"
	        BLAST_FILES_EXIST=false
	    fi
	fi

	# Set BLASTDB to the cached location
	export BLASTDB="$BLAST_CACHE_DIR"

	# Step 5: Copy data and rebuild indexes
	echo "[5/7] Rebuilding Taxoniq indexes..."
	# Append the previous PYTHONPATH only when it is non-empty; a trailing ":"
	# would add an empty entry (the current directory) to the import path.
	export PYTHONPATH="$TAXONIQ_REPO${PYTHONPATH:+:$PYTHONPATH}"
	cd "$TAXONIQ_REPO"
	cp "$WORKDIR"/nodes.dmp .
	cp "$WORKDIR"/names.dmp .
	# merged.dmp and delnodes.dmp are copied only when they were extracted.
	if [[ -f "$WORKDIR/merged.dmp" ]]; then
	    cp "$WORKDIR/merged.dmp" .
	fi
	if [[ -f "$WORKDIR/delnodes.dmp" ]]; then
	    cp "$WORKDIR/delnodes.dmp" .
	fi

	echo " Building indexes (this takes 10-30 minutes)..."
	python3 -m taxoniq.build trees || {
	    echo " ✗ Build failed!"
	    exit 1
	}
	echo " ✓ Indexes rebuilt"

	# Step 6: Update version numbers
	echo "[6/7] Updating version numbers to $VERSION..."
	# VERSION is a plain (unexported) shell variable, so it is passed to the
	# child explicitly; otherwise Python would never see the value given on
	# the command line. "<<-" strips the leading tab from every here-document
	# line, and the quoted delimiter stops the shell from expanding anything
	# inside the Python source.
	VERSION="$VERSION" python3 <<-'EOF'
	import os
	import re

	version = os.environ["VERSION"]

	files_to_update = [
	    "setup.py",
	    "db_packages/ncbi_taxon_db/setup.py",
	    "db_packages/ncbi_refseq_accession_db/setup.py",
	    "db_packages/ncbi_refseq_accession_lengths/setup.py",
	    "db_packages/ncbi_refseq_accession_offsets/setup.py",
	]

	db_packages = [
	    "ncbi-taxon-db",
	    "ncbi-refseq-accession-db",
	    "ncbi-refseq-accession-lengths",
	    "ncbi-refseq-accession-offsets",
	]

	for filepath in files_to_update:
	    if not os.path.exists(filepath):
	        print(f" ⚠️ {filepath} not found")
	        continue

	    with open(filepath, "r") as f:
	        content = f.read()

	    # Update the version keyword. \s* accepts both version="x" and
	    # version = "x"; \b keeps names such as python_version untouched.
	    # A callable replacement avoids backslash processing of the value.
	    updated = re.sub(
	        r'\bversion\s*=\s*["\'][\d.]+["\']',
	        lambda m: f'version="{version}"',
	        content,
	    )

	    # Update pinned data-package requirements, keeping the operator
	    # (>= or ==) that the file already uses.
	    for pkg in db_packages:
	        updated = re.sub(
	            rf'({re.escape(pkg)})\s*(>=|==)\s*[\d.]+',
	            lambda m: f"{m.group(1)} {m.group(2)} {version}",
	            updated,
	        )

	    with open(filepath, "w") as f:
	        f.write(updated)

	    print(f" ✓ Updated {filepath}")
	EOF

	# Step 7: Test and report
	echo "[7/7] Testing updated Taxoniq..."
	# pip's stderr is left visible: the script runs under "set -e", so a failed
	# install aborts it, and pip's own message is the only explanation shown.
	pip install --force-reinstall --no-cache-dir -q -e .
	pip install --force-reinstall --no-cache-dir -q \
	    db_packages/ncbi_taxon_db \
	    db_packages/ncbi_refseq_accession_db \
	    db_packages/ncbi_refseq_accession_lengths \
	    db_packages/ncbi_refseq_accession_offsets

	# "<<-" strips the leading tab from each line; the quoted delimiter keeps
	# the shell from touching the Python source.
	python3 <<-'EOF'
	import sys
	import traceback

	import taxoniq

	try:
	    # Test basic functionality. An explicit check is used instead of
	    # "assert", which is skipped when Python runs with -O.
	    human = taxoniq.Taxon(9606)  # Human
	    if human.scientific_name != "Homo sapiens":
	        raise RuntimeError("Human lookup failed")

	    # Test rank names
	    bacteria = taxoniq.Taxon(2)
	    print(f" ✓ Bacteria rank: {bacteria.rank.name}")

	    # Test lineage
	    e_coli = taxoniq.Taxon(562)
	    print(f" ✓ E. coli lineage has {len(e_coli.ranked_lineage)} entries")

	    # Test RefSeq
	    genomes = human.refseq_representative_genome_accessions
	    print(f" ✓ H. sapiens has {len(genomes)} RefSeq representative genomes")

	    print("\n✅ All tests passed!")

	except Exception as e:
	    print(f"❌ Test failed: {e}")
	    traceback.print_exc()
	    sys.exit(1)
	EOF

	# Final summary
	echo
	echo "=========================================="
	echo "✅ Rebuild Complete!"
	echo "=========================================="
	echo
	echo "Summary:"
	echo " Version: $VERSION"
	echo " Cache directory: $CACHE_DIR"
	echo " Work directory: $WORKDIR"
	echo " Updated files:"
	echo " - setup.py"
	echo " - db_packages/*/setup.py"
	# The globs are inside double quotes, so they are printed literally.
	echo " - db_packages/*/ncbi_*/version.py"
	echo
	echo "Next steps:"
	echo " 1. Review changes: git diff"
	echo " 2. Test thoroughly: python3 test_bugsigdb_divergences.py"
	echo " 3. Commit: git add -A && git commit -m 'Update to NCBI taxonomy $VERSION'"
	echo " 4. Tag: git tag -a v1.0.5 -m 'Taxoniq v1.0.5 with NCBI taxonomy $VERSION'"
	echo
	echo "Optional cleanup:"
	# Paths are shown quoted so the suggested commands stay safe to paste
	# when the cache location contains spaces.
	echo " rm -rf \"$WORKDIR\""
	echo " rm -rf \"$CACHE_DIR\" # Remove all cached files"
	echo
	#!/bin/bash
	# Quick reference for running BugSigDB divergence tests.
	# Creates/activates a local .venv, installs the package in editable mode,
	# then runs the three test scripts in sequence.

	set -e

	echo "======================================================================"
	echo "Taxoniq BugSigDB Divergence Tests - Quick Start"
	echo "======================================================================"
	echo ""

	# Check if venv exists
	if [ ! -d ".venv" ]; then
	    echo "Creating virtual environment..."
	    python -m venv .venv
	fi

	# Activate venv
	echo "Activating virtual environment..."
	source .venv/bin/activate

	# Install dependencies if needed
	echo "Installing taxoniq..."
	# Best-effort: a failed install is reported but does not stop the run.
	pip install -q -e . 2>/dev/null || echo "Warning: 'pip install -e .' failed; continuing with the current environment" >&2

	echo ""
	echo "======================================================================"
	echo "[Test 1/3] Running BugSigDB Divergences Test (Python API)"
	echo "======================================================================"
	python test_bugsigdb_divergences.py

	echo ""
	echo "======================================================================"
	echo "[Test 2/3] Running PR Test"
	echo "======================================================================"
	python test_taxoniq_pr.py

	echo ""
	echo "======================================================================"
	echo "[Test 3/3] Running CLI Divergences Test"
	echo "======================================================================"
	# A non-zero exit here is tolerated and reported as a note.
	python test_taxoniq_cli_divergences.py || echo "Note: Some CLI tests may have format differences"

	echo ""
	echo "======================================================================"
	echo "All tests complete!"
	echo "======================================================================"
	echo ""
	echo "For detailed results and analysis, see BUGSIGDB_TESTING.md"
	#!/usr/bin/env python3
	"""
	Test Taxoniq Python API against known divergences identified in BugSigDB issue #248.

	This test uses the Python API to directly test taxonomy lookups and compares them
	to known divergences between Taxoniq and the NCBI API.

	Reference: https://github.com/waldronlab/BugSigDB/issues/248
	"""

	import taxoniq
	import sys

	def test_rank_name_changes():
	    """Print the rank Taxoniq reports for four top-level taxa.

	    Each taxon is looked up through the Python API and its rank name is
	    annotated as the older 'superkingdom', the newer 'domain', or
	    'acellular root'. Lookup failures are printed rather than raised.
	    """
	    print("\n[Test 1] Rank Name Changes (March 2025 NCBI Update)")
	    print("-" * 70)
	    print("Note: Taxoniq uses older NCBI data (Sept 2024)")
	    print("Expected: superkingdom rank (not yet updated to 'domain')")
	    print()

	    test_cases = [
	        (2, "Bacteria"),
	        (2157, "Archaea"),
	        (2759, "Eukaryota"),
	        (10239, "Viruses"),
	    ]

	    for tax_id, name in test_cases:
	        try:
	            t = taxoniq.Taxon(tax_id)
	            # Fall back to str() if the rank object has no .name attribute.
	            rank_name = t.rank.name if hasattr(t.rank, 'name') else str(t.rank)
	            print(f"{name:20} (ID: {tax_id:5}) - Rank: {rank_name}")

	            if rank_name == 'superkingdom':
	                print(f" ⚠️ Uses old 'superkingdom' rank (expected for Sept 2024 data)")
	            elif rank_name == 'domain':
	                print(f" ✓ Uses new 'domain' rank (updated to March 2025 NCBI)")
	            elif rank_name in ('acellular root', 'acellular_root'):
	                # Same case the CLI variant of this test reports.
	                # NOTE(review): exact spelling of the rank name in the
	                # Python API is assumed - confirm against taxoniq.
	                print(f" ✓ Viruses use 'acellular root' rank (post-March 2025)")
	            else:
	                print(f" ℹ️ Unrecognised top-level rank name: {rank_name}")
	        except taxoniq.NoValue:
	            print(f"{name:20} (ID: {tax_id:5}) - NOT FOUND")
	        except Exception as e:
	            print(f"{name:20} (ID: {tax_id:5}) - ERROR: {e}")

	def test_bacteria_kingdom_divergence():
	    """Print the ranked lineage of several bacterial taxa.

	    For each taxon the first eight lineage entries are listed, followed by
	    a note on whether any entry's scientific name contains 'bacteria'.
	    Lookup failures are printed rather than raised.
	    """
	    print("\n[Test 2] Bacteria Kingdom Divergence")
	    print("-" * 70)
	    print("Known divergence: Taxoniq may use generic k__Bacteria (2)")
	    print("while NCBI may use newer kingdom ranks like Pseudomonadati (3379134)")
	    print()

	    # NOTE(review): the labels are display-only; confirm that IDs 976,
	    # 1297 and 74201 really correspond to the genera named here.
	    test_cases = [
	        (976, "Pseudomonas", "Should have phylum in lineage"),
	        (1297, "Thermotoga", "Should have phylum in lineage"),
	        (74201, "Helicobacter", "Should have phylum in lineage"),
	        (562, "Escherichia coli", "Species with complete lineage"),
	    ]

	    max_shown = 8  # Limit output

	    for tax_id, name, description in test_cases:
	        try:
	            t = taxoniq.Taxon(tax_id)
	            print(f"\n{name} (ID: {tax_id})")
	            print(f" {description}")
	            print(f" Scientific name: {t.scientific_name}")
	            print(f" Rank: {t.rank.name if hasattr(t.rank, 'name') else t.rank}")

	            # Get lineage
	            lineage = t.ranked_lineage
	            print(f" Lineage ({len(lineage)} entries):")
	            for position, entry in enumerate(lineage[:max_shown], start=1):
	                rank_name = entry.rank.name if hasattr(entry.rank, 'name') else str(entry.rank)
	                print(f" {position}. {entry.scientific_name:30} [{rank_name:15}] (ID: {entry.tax_id})")

	            # Only mention truncation when something was actually left
	            # out (avoids "... (0 more entries)" for 8-entry lineages).
	            remaining = len(lineage) - max_shown
	            if remaining > 0:
	                print(f" ... ({remaining} more entries)")

	            # Check for Bacteria
	            bacteria_entry = next(
	                (entry for entry in lineage if 'bacteria' in entry.scientific_name.lower()),
	                None,
	            )
	            if bacteria_entry is not None:
	                print(f" ✓ Bacteria entry found: {bacteria_entry.scientific_name} (ID: {bacteria_entry.tax_id})")
	            elif tax_id != 562:  # E. coli might be different
	                print(f" ⚠️ No 'Bacteria' entry in lineage")

	        except taxoniq.NoValue:
	            print(f"{name} (ID: {tax_id}) - NOT FOUND in Taxoniq")
	        except Exception as e:
	            print(f"{name} (ID: {tax_id}) - ERROR: {e}")

	def test_eukaryota_inclusion():
	    """Report whether Eukaryota (2759) appears in selected ranked lineages.

	    Prints the lineage IDs and the full ranked lineage for each taxon;
	    lookup failures are printed rather than raised.
	    """
	    print("\n[Test 3] Eukaryota Inclusion Divergence")
	    print("-" * 70)
	    print("Known divergence: Taxoniq includes Eukaryota (2759) in lineages")
	    print("while NCBI may start at lower ranks like Fungi (4751)")
	    print()

	    cases = (
	        (4751, "Fungi", "Kingdom-level organism"),
	        (6239, "Caenorhabditis elegans", "Nematode"),
	        (9606, "Homo sapiens", "Mammal/Human"),
	    )

	    for taxon_id, label, note in cases:
	        try:
	            taxon = taxoniq.Taxon(taxon_id)
	            print(f"\n{label} (ID: {taxon_id})")
	            print(f" {note}")
	            print(f" Scientific name: {taxon.scientific_name}")

	            ancestors = taxon.ranked_lineage
	            ancestor_ids = [node.tax_id for node in ancestors]
	            print(f" Lineage IDs: {ancestor_ids}")

	            # Check for Eukaryota (2759)
	            if 2759 not in ancestor_ids:
	                print(f" ⚠️ Eukaryota (2759) NOT in lineage (NCBI divergence)")
	            else:
	                position = ancestor_ids.index(2759)
	                print(f" ✓ Eukaryota (2759) found at position {position}")

	            # Print lineage with ranks
	            print(f" Full lineage ({len(ancestors)} entries):")
	            for number, node in enumerate(ancestors, start=1):
	                if hasattr(node.rank, 'name'):
	                    rank_label = node.rank.name
	                else:
	                    rank_label = str(node.rank)
	                print(f" {number}. {node.scientific_name:25} [{rank_label:15}] ID: {node.tax_id}")

	        except taxoniq.NoValue:
	            print(f"{label} (ID: {taxon_id}) - NOT FOUND in Taxoniq")
	        except Exception as err:
	            print(f"{label} (ID: {taxon_id}) - ERROR: {err}")

	def test_missing_taxa():
	    """Look up taxa reported as missing and summarise the outcome.

	    Every ID is classified as found, missing (``taxoniq.NoValue``) or
	    errored (any other exception), and all three counts feed the summary.
	    """
	    print("\n[Test 4] Missing Taxa in Taxoniq")
	    print("-" * 70)
	    print("These taxa are known to be missing from the Sept 2024 Taxoniq DB")
	    print()

	    # NOTE(review): "Unknown species 4" and "5" are absent from this list
	    # while the CLI variant of this test includes IDs 215579 and 270497 -
	    # confirm whether the omission is intentional.
	    missing_taxa = [
	        (1182571, "Candidatus Monteginia"),
	        (1505663, "Unknown species 1"),
	        (1535326, "Unknown species 2"),
	        (1909303, "Unknown species 3"),
	        (3379134, "k__Pseudomonadati (new kingdom rank, March 2025)"),
	        (424536, "Unknown species 6"),
	        (541000, "Unknown species 7"),
	    ]

	    missing_count = 0
	    found_count = 0
	    error_count = 0

	    for tax_id, name in missing_taxa:
	        try:
	            t = taxoniq.Taxon(tax_id)
	            rank_name = t.rank.name if hasattr(t.rank, 'name') else str(t.rank)
	            scientific_name = t.scientific_name
	        except taxoniq.NoValue:
	            missing_count += 1
	            print(f"✗ {name:40} (ID: {tax_id}) - NOT FOUND")
	        except Exception as e:
	            # Previously such taxa were counted nowhere, so the summary
	            # totals did not add up to the number of taxa tested.
	            error_count += 1
	            print(f"✗ {name:40} (ID: {tax_id}) - ERROR: {e}")
	        else:
	            # Counted as found only once every attribute was readable.
	            found_count += 1
	            print(f"✓ {name:40} (ID: {tax_id})")
	            print(f" Scientific name: {scientific_name}, Rank: {rank_name}")

	    summary = f"\n Summary: {found_count} found, {missing_count} missing"
	    if error_count:
	        summary += f", {error_count} errors"
	    print(summary)
	    if found_count > 0:
	        print(f" ⚠️ Database may have been updated since issue was filed")

	def test_refseq_availability():
	    """Print how many RefSeq representative genomes a few taxa expose.

	    Failures while fetching the accessions are reported per taxon and do
	    not stop the loop.
	    """
	    print("\n[Test 5] RefSeq Representative Genome Availability")
	    print("-" * 70)

	    organisms = (
	        (9606, "Homo sapiens", "Human"),
	        (562, "Escherichia coli", "E. coli"),
	        (6239, "Caenorhabditis elegans", "Worm"),
	    )

	    for taxon_id, label, nickname in organisms:
	        try:
	            taxon = taxoniq.Taxon(taxon_id)
	            print(f"\n{label} ({nickname})")
	            print(f" Taxon ID: {taxon_id}")

	            # Get representative genomes
	            try:
	                accessions = taxon.refseq_representative_genome_accessions
	                print(f" ✓ RefSeq representative genomes: {len(accessions)} available")
	                if accessions:
	                    print(f" First 3: {accessions[:3]}")
	            except taxoniq.NoValue:
	                print(f" ℹ️ No RefSeq representative genomes indexed")
	            except Exception as inner_err:
	                print(f" ⚠️ Error fetching RefSeq genomes: {inner_err}")

	        except Exception as outer_err:
	            print(f"{label} ({nickname}) - ERROR: {outer_err}")

	def test_performance():
	    """Time a handful of Taxoniq lookups and print totals and the average.

	    A lookup counts as successful when the scientific name, rank and ranked
	    lineage can all be read. Failed lookups are reported and excluded from
	    the average; the average is 0 when nothing succeeded.
	    """
	    print("\n[Test 6] Performance Test")
	    print("-" * 70)

	    import time

	    test_ids = [9606, 562, 4751, 2, 2759, 10239, 131567]  # Various taxa

	    print(f"Testing {len(test_ids)} taxa lookups...")

	    # perf_counter is monotonic, so the interval cannot be distorted by
	    # wall-clock adjustments.
	    start = time.perf_counter()
	    successful = 0

	    for tax_id in test_ids:
	        try:
	            t = taxoniq.Taxon(tax_id)
	            _ = t.scientific_name  # Force actual lookup
	            _ = t.rank
	            _ = t.ranked_lineage
	            successful += 1
	        except Exception as e:
	            # A bare "except: pass" would also swallow KeyboardInterrupt
	            # and hide why a lookup failed.
	            print(f" ✗ Lookup failed for {tax_id}: {e}")

	    elapsed = time.perf_counter() - start
	    avg_time = (elapsed * 1000) / successful if successful > 0 else 0

	    print(f"✓ Completed {successful}/{len(test_ids)} lookups")
	    print(f" Total time: {elapsed*1000:.1f} ms")
	    print(f" Average per lookup: {avg_time:.1f} ms")
	    print(f" Note: NCBI API would take seconds per lookup")

	def main():
	    """Run every API divergence test in order and print a closing banner.

	    Exits with status 1 on Ctrl-C or on any unexpected exception (after
	    printing its traceback).
	    """
	    print("=" * 70)
	    print("Taxoniq Tests - BugSigDB Issue #248 Divergences")
	    # A missing __version__ should not abort the run before any test has
	    # executed.
	    version = getattr(taxoniq, "__version__", "unknown")
	    print(f"Using Taxoniq {version}")
	    print("=" * 70)

	    try:
	        test_rank_name_changes()
	        test_bacteria_kingdom_divergence()
	        test_eukaryota_inclusion()
	        test_missing_taxa()
	        test_refseq_availability()
	        test_performance()

	        print("\n" + "=" * 70)
	        print("Testing complete!")
	        print("=" * 70)

	    except KeyboardInterrupt:
	        print("\n\nTesting interrupted by user")
	        sys.exit(1)
	    except Exception as e:
	        print(f"\n\nUnexpected error: {e}")
	        import traceback
	        traceback.print_exc()
	        sys.exit(1)

	if __name__ == "__main__":
	main()
	#!/usr/bin/env python3
	"""
	Test Taxoniq CLI against known divergences identified in BugSigDB issue #248.

	This script tests specific taxonomy IDs and scenarios that are known to have
	differences between Taxoniq and the NCBI API.

	Key known divergences:
	1. Taxoniq uses k__Bacteria (2) while NCBI may use newer kingdom ranks like
	k__Pseudomonadati (3379134) or k__Thermotogati (3384194)
	2. Taxoniq includes Eukaryota (2759) in lineages while NCBI may start at lower ranks
	3. Rank name changes: superkingdom -> domain (as of March 2025)
	4. Missing taxa in older Taxoniq DB
	"""

	import json
	import subprocess
	import sys

	def run_taxoniq_cli(tax_id, command='scientific-name', timeout=5):
	    """Run ``taxoniq <command> --taxon-id <tax_id>`` and return its output.

	    Args:
	        tax_id: Taxon ID; converted with ``str()`` for the command line.
	        command: CLI subcommand to invoke.
	        timeout: Seconds to wait for the CLI before giving up.

	    Returns:
	        The stripped stdout with surrounding double quotes removed. When
	        the output looks like an enum repr (contains both '<' and '>'),
	        only the member name is returned. ``None`` when the CLI exits
	        non-zero, cannot be started, or times out.
	    """
	    try:
	        result = subprocess.run(
	            ['taxoniq', command, '--taxon-id', str(tax_id)],
	            capture_output=True,
	            text=True,
	            timeout=timeout,
	        )
	    except (OSError, subprocess.SubprocessError) as e:
	        # OSError covers a missing executable; SubprocessError covers the
	        # timeout.
	        print(f" ERROR running CLI: {e}")
	        return None

	    if result.returncode != 0:
	        return None

	    output = result.stdout.strip().strip('"')
	    # Handle Enum representation: take the text after the last '.', then
	    # drop anything from ':' onwards and a trailing '>'.
	    if '<' in output and '>' in output:
	        parts = output.split('.')
	        if len(parts) > 1:
	            return parts[-1].split(':')[0].rstrip('>')
	    return output

	def run_taxoniq_cli_json(tax_id, command='lineage'):
	    """Run a taxoniq CLI subcommand with JSON output and parse the result.

	    Lines printed before the first line that starts with '[' or '{' are
	    ignored, and so is any text following the first complete JSON value.

	    Returns:
	        The decoded JSON value, or ``None`` when the CLI exits non-zero,
	        cannot be started, times out, or prints no parseable JSON.
	    """
	    try:
	        result = subprocess.run(
	            ['taxoniq', command, '--taxon-id', str(tax_id), '--output-format', 'json'],
	            capture_output=True,
	            text=True,
	            timeout=5,
	        )
	        if result.returncode != 0:
	            return None

	        lines = [line.strip() for line in result.stdout.strip().split('\n')]
	        # Skip any non-JSON preamble.
	        start = next(
	            (idx for idx, line in enumerate(lines) if line.startswith(('[', '{'))),
	            None,
	        )
	        if start is None:
	            return None

	        # raw_decode stops at the end of the first JSON value, so trailing
	        # non-JSON lines no longer make the whole parse fail.
	        payload, _end = json.JSONDecoder().raw_decode('\n'.join(lines[start:]))
	        return payload
	    except (OSError, subprocess.SubprocessError, ValueError) as e:
	        # json.JSONDecodeError is a ValueError subclass.
	        print(f" ERROR running CLI JSON: {e}")
	        return None

	def test_bacteria_kingdom_divergence():
	    """Print name, rank and ranked lineage of bacterial taxa via the CLI.

	    At most the first ten lineage entries are shown for each taxon.
	    """
	    print("\n[Test 1] Bacteria Kingdom Divergence")
	    print("-" * 60)

	    cases = (
	        (976, "Pseudomonas"),
	        (1297, "Thermotoga"),
	        (74201, "Helicobacter"),
	        (562, "Escherichia coli"),
	    )

	    for taxon_id, label in cases:
	        print(f"\nTesting {label} (TaxID: {taxon_id})")
	        scientific = run_taxoniq_cli(taxon_id, 'scientific-name')
	        reported_rank = run_taxoniq_cli(taxon_id, 'rank')

	        if not (scientific and reported_rank):
	            print(f" ✗ Could not fetch data for {taxon_id}")
	            continue

	        print(f" Scientific Name: {scientific}")
	        print(f" Rank: {reported_rank}")

	        # Get lineage
	        lineage = run_taxoniq_cli_json(taxon_id, 'ranked-lineage')
	        if not (lineage and isinstance(lineage, list)):
	            print(f" Could not parse lineage")
	            continue

	        print(f" Ranked lineage ({len(lineage)} entries):")
	        # Limit to first 10; non-dict entries are skipped silently.
	        for record in lineage[:10]:
	            if not isinstance(record, dict):
	                continue
	            sci = record.get('scientific_name', 'N/A')
	            rank = record.get('rank', 'N/A')
	            ident = record.get('tax_id', 'N/A')
	            print(f" - {sci} ({rank}) [ID: {ident}]")

	def test_eukaryota_inclusion():
	    """Check through the CLI whether Eukaryota (2759) is in each lineage."""
	    print("\n[Test 2] Eukaryota Inclusion Divergence")
	    print("-" * 60)

	    cases = (
	        (4751, "Fungi"),
	        (6239, "Caenorhabditis elegans"),
	        (3239874, "Saccharomyces cerevisiae"),
	    )

	    for taxon_id, label in cases:
	        print(f"\nTesting {label} (TaxID: {taxon_id})")
	        scientific = run_taxoniq_cli(taxon_id, 'scientific-name')

	        if not scientific:
	            print(f" ✗ Could not fetch data for {taxon_id}")
	            continue

	        lineage = run_taxoniq_cli_json(taxon_id, 'ranked-lineage')
	        if not (lineage and isinstance(lineage, list)):
	            print(f" Could not parse lineage JSON")
	            continue

	        # Only dict entries carry the fields printed below.
	        records = [item for item in lineage if isinstance(item, dict)]
	        ids = [record.get('tax_id') for record in records]

	        print(f" Scientific Name: {scientific}")
	        print(f" Lineage IDs: {ids}")

	        # Check for Eukaryota (2759)
	        if 2759 not in ids:
	            print(f" ⚠️ Eukaryota (2759) NOT in lineage")
	        else:
	            position = ids.index(2759)
	            print(f" ✓ Eukaryota (2759) found at position {position}")

	        # Print full lineage
	        print(f" Full lineage:")
	        for record in records:
	            print(f" - {record.get('scientific_name')} ({record.get('rank')}) [ID: {record.get('tax_id')}]")

	def test_missing_taxa():
	    """Query the CLI for taxa reported as missing and print a tally."""
	    print("\n[Test 3] Missing Taxa in Taxoniq")
	    print("-" * 60)

	    # These taxa are known to be missing from the older Taxoniq DB
	    candidates = (
	        (1182571, "Candidatus Monteginia"),
	        (1505663, "Unknown species 1"),
	        (1535326, "Unknown species 2"),
	        (1909303, "Unknown species 3"),
	        (215579, "Unknown species 4"),
	        (270497, "Unknown species 5"),
	        (3379134, "k__Pseudomonadati (new kingdom rank)"),
	        (424536, "Unknown species 6"),
	        (541000, "Unknown species 7"),
	    )

	    found_count = 0
	    missing_count = 0

	    for taxon_id, label in candidates:
	        scientific = run_taxoniq_cli(taxon_id, 'scientific-name')
	        if not scientific:
	            missing_count += 1
	            print(f"\n✗ {label} (ID: {taxon_id}) - NOT FOUND")
	            continue

	        found_count += 1
	        # The rank is only fetched for taxa that were found.
	        reported_rank = run_taxoniq_cli(taxon_id, 'rank')
	        print(f"\n✓ {label} (ID: {taxon_id})")
	        print(f" Scientific name: {scientific}")
	        print(f" Rank: {reported_rank}")

	    print(f"\n Summary: {found_count} found, {missing_count} missing")

	def test_rank_name_changes():
	    """Print the CLI-reported rank of four top-level taxa with a note."""
	    print("\n[Test 4] Rank Name Changes (March 2025 NCBI Update)")
	    print("-" * 60)

	    # Note printed for each recognised rank; other ranks get no note.
	    rank_notes = {
	        'superkingdom': f" ⚠️ Uses old 'superkingdom' rank (pre-March 2025)",
	        'domain': f" ✓ Uses new 'domain' rank (post-March 2025)",
	        'acellular root': f" ✓ Viruses use 'acellular root' rank (post-March 2025)",
	    }

	    # These organisms are affected by superkingdom -> domain change
	    cases = (
	        (2, "Bacteria"),
	        (2157, "Archaea"),
	        (2759, "Eukaryota"),
	        (10239, "Viruses"),
	    )

	    for taxon_id, label in cases:
	        print(f"\nTesting {label} (TaxID: {taxon_id})")
	        reported = run_taxoniq_cli(taxon_id, 'rank')

	        if not reported:
	            print(f" ✗ Could not fetch data for {taxon_id}")
	            continue

	        print(f" Rank: {reported}")
	        if reported in rank_notes:
	            print(rank_notes[reported])

	def test_representative_genomes():
	    """Print RefSeq representative genome accessions reported by the CLI.

	    A CLI that cannot be started or that times out is reported per taxon
	    instead of aborting the whole run.
	    """
	    print("\n[Test 5] RefSeq Representative Genomes")
	    print("-" * 60)

	    test_cases = [
	        (9606, "Homo sapiens"),
	        (562, "Escherichia coli"),
	        (6239, "Caenorhabditis elegans"),
	    ]

	    for tax_id, name in test_cases:
	        print(f"\nTesting {name} (TaxID: {tax_id})")
	        try:
	            result = subprocess.run(
	                ['taxoniq', 'refseq-representative-genome-accessions', '--taxon-id', str(tax_id), '--output-format', 'json'],
	                capture_output=True,
	                text=True,
	                timeout=5,
	            )
	        except (OSError, subprocess.SubprocessError) as e:
	            # Missing executable or timeout: report and move on, as the
	            # other CLI tests do.
	            print(f" ✗ Could not run taxoniq CLI: {e}")
	            continue

	        if result.returncode != 0:
	            print(f" ✗ Could not fetch RefSeq data")
	            continue

	        try:
	            # Drop blank lines and lines starting with 'Taxoniq' before
	            # parsing what remains as one JSON document.
	            lines = [
	                line.strip()
	                for line in result.stdout.strip().split('\n')
	                if line.strip() and not line.strip().startswith('Taxoniq')
	            ]
	            data = json.loads('\n'.join(lines))
	        except Exception as e:
	            print(f" Could not parse RefSeq output: {e}")
	            continue

	        if isinstance(data, list):
	            print(f" Found {len(data)} RefSeq representatives")
	            if data:
	                print(f" First 3: {data[:3]}")
	        else:
	            print(f" Unexpected output format")

	def test_cli_help_and_version():
	    """Check that ``taxoniq --help`` and ``taxoniq --version`` succeed.

	    A CLI that is missing or hangs is reported as a failed check instead
	    of raising out of the test run.
	    """
	    print("\n[Test 0] CLI Help and Version")
	    print("-" * 60)

	    def _run(flag):
	        # Return the CompletedProcess, or None when the CLI cannot be
	        # started or does not finish within the timeout.
	        try:
	            return subprocess.run(
	                ['taxoniq', flag], capture_output=True, text=True, timeout=30
	            )
	        except (OSError, subprocess.SubprocessError) as e:
	            print(f" ERROR running CLI: {e}")
	            return None

	    # Test help
	    result = _run('--help')
	    if result is not None and result.returncode == 0:
	        print("✓ 'taxoniq --help' works")
	    else:
	        print("✗ 'taxoniq --help' failed")

	    # Test version
	    result = _run('--version')
	    if result is not None and result.returncode == 0:
	        version = result.stdout.strip()
	        print(f"✓ Taxoniq version: {version}")
	    else:
	        print("✗ Could not get version")

	def main():
	    """Run all CLI divergence tests in order, then print a closing banner.

	    Exits with status 1 on Ctrl-C or on any unexpected exception (after
	    printing its traceback).
	    """
	    rule = "=" * 60
	    print(rule)
	    print("Taxoniq CLI Tests - BugSigDB Issue #248 Divergences")
	    print(rule)

	    try:
	        # Names are resolved inside the try block, one at a time, so a
	        # failure is handled by the generic handler below.
	        test_cli_help_and_version()
	        test_rank_name_changes()
	        test_bacteria_kingdom_divergence()
	        test_eukaryota_inclusion()
	        test_missing_taxa()
	        test_representative_genomes()

	        print("\n" + rule)
	        print("Testing complete!")
	        print(rule)

	    except KeyboardInterrupt:
	        print("\n\nTesting interrupted by user")
	        sys.exit(1)
	    except Exception as err:
	        print(f"\n\nUnexpected error: {err}")
	        import traceback
	        traceback.print_exc()
	        sys.exit(1)

	if __name__ == "__main__":
	main()
	#!/usr/bin/env python3
	"""
	Basic Taxoniq PR verification test.

	Tests core functionality and database freshness to ensure a PR
	doesn't break basic Taxoniq operations.
	"""

	import taxoniq
	import os
	import datetime
	import sys

	def test_taxoniq_pr():
	    """Smoke-test a Taxoniq install and report how recent its data looks.

	    Stage 1 exits with status 1 when the Taxon(9606) lookups are wrong or
	    raise. Stage 2 only warns when the ``taxoniq`` CLI cannot be run.
	    Stage 3 scans the installed package directories for data files and
	    prints an age estimate based on the newest modification time.
	    """
	    # Local import: the module header does not import subprocess.
	    import subprocess

	    print("--- Testing Taxoniq PR ---")
	    print(f"Python executable: {sys.executable}")

	    # 1. Check Package Version
	    version = getattr(taxoniq, "__version__", None)
	    if version is None:
	        print("Taxoniq package version: Unknown (no __version__ attribute)")
	    else:
	        print(f"Taxoniq package version: {version}")

	    # 2. Test Basic Functionality
	    print("\n[1/3] Testing Basic Lookups...")
	    try:
	        # Test a well-known taxon (Human)
	        human = taxoniq.Taxon(9606)
	        print(f" Query Taxon(9606): {human.scientific_name}")

	        if human.scientific_name != "Homo sapiens":
	            print(" FAILED: Taxon(9606) should be 'Homo sapiens'")
	            sys.exit(1)

	        if human.rank.name != "species":
	            print(f" FAILED: Taxon(9606) rank should be 'species', got '{human.rank.name}'")
	            sys.exit(1)

	        # The parent is only printed, not asserted.
	        parent = human.parent
	        print(f" Parent of Human: {parent.scientific_name} (ID: {parent.tax_id})")

	        print(" PASSED: Basic lookups working.")
	    except Exception as e:
	        # SystemExit is not an Exception subclass, so the explicit exits
	        # above pass through this handler.
	        print(f" FAILED: Exception during basic lookup: {e}")
	        sys.exit(1)

	    # 3. Test CLI availability (optional check if installed)
	    print("\n[2/3] Checking CLI...")
	    try:
	        # Argument list instead of a shell string: no dependence on
	        # /dev/null or a POSIX shell.
	        cli_ok = subprocess.run(
	            ["taxoniq", "--help"],
	            stdout=subprocess.DEVNULL,
	            stderr=subprocess.DEVNULL,
	            timeout=30,
	        ).returncode == 0
	    except (OSError, subprocess.SubprocessError):
	        cli_ok = False

	    if cli_ok:
	        print(" PASSED: 'taxoniq' CLI is available.")
	    else:
	        print(" WARNING: 'taxoniq' CLI not found in path (this might be expected if not installed globally).")

	    # 4. Inspect Database Freshness
	    print("\n[3/3] Inspecting Database Version/Freshness...")
	    # NOTE(review): modification times may reflect install time rather
	    # than the date of the underlying data - treat the result as a hint.
	    package_dir = os.path.dirname(taxoniq.__file__)
	    print(f" Package directory: {package_dir}")
	    search_dirs = [package_dir]

	    # The data package is optional for this report.
	    try:
	        import ncbi_taxon_db
	    except ImportError:
	        print(" WARNING: ncbi_taxon_db is not importable; only the taxoniq package is scanned.")
	    else:
	        ncbi_taxon_db_dir = os.path.dirname(ncbi_taxon_db.__file__)
	        print(f" NCBI Taxon DB directory: {ncbi_taxon_db_dir}")
	        search_dirs.append(ncbi_taxon_db_dir)

	    # Collect (mtime, path) pairs, reading each mtime exactly once.
	    data_files = []
	    for search_dir in search_dirs:
	        for root, _dirs, files in os.walk(search_dir):
	            for file in files:
	                if file.endswith((".marisa", ".db", ".npy")) or "index" in file:
	                    full_path = os.path.join(root, file)
	                    try:
	                        data_files.append((os.path.getmtime(full_path), full_path))
	                    except OSError:
	                        continue

	    if not data_files:
	        print(" WARNING: No obvious data files found to check timestamps.")
	        return

	    # Newest first
	    data_files.sort(key=lambda item: item[0], reverse=True)

	    # commonpath raises ValueError when the directories share no common
	    # root; fall back to absolute paths in that case.
	    try:
	        base_dir = os.path.commonpath(search_dirs)
	    except ValueError:
	        base_dir = None

	    print(" Most recent data files found:")
	    for mtime, filepath in data_files[:5]:
	        dt = datetime.datetime.fromtimestamp(mtime)
	        rel_path = os.path.relpath(filepath, base_dir) if base_dir else filepath
	        print(f" - {rel_path}: {dt.strftime('%Y-%m-%d %H:%M:%S')}")

	    newest_file_date = datetime.datetime.fromtimestamp(data_files[0][0])
	    age = datetime.datetime.now() - newest_file_date

	    print(f"\n Database Age Estimate: ~{age.days} days old")
	    if age.days < 30:
	        print(" RESULT: The taxonomy data appears to be RECENT.")
	    else:
	        print(" RESULT: The taxonomy data might be OLDER (check if this is expected).")

	if __name__ == "__main__":
	test_taxoniq_pr()