Skip to content

Instantly share code, notes, and snippets.

@ToluClassics
Created September 24, 2025 21:47
Show Gist options
  • Select an option

  • Save ToluClassics/8d2dd4d335bfbc82bd9b52b4416aa3a5 to your computer and use it in GitHub Desktop.

Select an option

Save ToluClassics/8d2dd4d335bfbc82bd9b52b4416aa3a5 to your computer and use it in GitHub Desktop.

Code Design Document: africanlanguages Package

1. Package Structure and Initial Classes

1.1 Core Package Structure

africanlanguages/
├── __init__.py
├── core/
│   ├── __init__.py
│   ├── base.py
│   ├── exceptions.py
│   └── config.py
├── languages/
│   ├── __init__.py
│   ├── models.py
│   ├── registry.py
│   ├── discovery.py
│   └── codes.py
├── text/
│   ├── __init__.py
│   ├── processors.py
│   ├── encoding.py
│   ├── normalization.py
│   └── tokenizers.py
├── names/
│   ├── __init__.py
│   ├── generators.py
│   └── sources.py
├── numbers/
│   ├── __init__.py
│   ├── converters.py
│   └── rules.py
├── evaluation/
│   ├── __init__.py
│   ├── benchmarks.py
│   └── metrics.py
├── data/
│   ├── __init__.py
│   ├── loaders.py
│   └── resources/
└── utils/
    ├── __init__.py
    ├── cache.py
    └── helpers.py

2. Core Classes and Data Models

2.1 Core Base Classes (core/base.py)

from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, field
from enum import Enum

class ProcessingMode(Enum):
    """How strictly processors treat irregular input (semantics per member
    name; exact behavior is defined by each processor — TODO confirm)."""
    STRICT = "strict"
    LENIENT = "lenient"
    AUTO = "auto"

@dataclass
class ProcessingResult:
    """Base result class for processing operations."""
    success: bool  # True when the operation completed without fatal errors
    data: Any = None  # payload; concrete type depends on the processor
    errors: List[str] = field(default_factory=list)  # fatal problems
    warnings: List[str] = field(default_factory=list)  # non-fatal issues
    metadata: Dict[str, Any] = field(default_factory=dict)  # processor-specific extras

class BaseProcessor(ABC):
    """Abstract base class for all processors.

    Template method pattern: the constructor stores the config and then
    calls the subclass `_setup()` hook, so subclasses must assign any
    attributes `_setup()` reads before calling super().__init__().
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        # The passed dict is kept by reference, not copied.
        self.config = config or {}
        self._setup()

    @abstractmethod
    def _setup(self) -> None:
        """Initialize processor-specific setup (runs at construction)."""
        pass

    @abstractmethod
    def process(self, data: Any, **kwargs) -> ProcessingResult:
        """Process input data and return a ProcessingResult."""
        pass

class Singleton(type):
    """Metaclass implementing the singleton pattern.

    The first call to a class using this metaclass constructs the
    instance; every subsequent call returns that same object. One
    instance is kept per class in `_instances`. Not thread-safe.
    """
    _instances = {}

    def __call__(cls, *args, **kwargs):
        try:
            return cls._instances[cls]
        except KeyError:
            instance = super().__call__(*args, **kwargs)
            cls._instances[cls] = instance
            return instance

2.2 Configuration Management (core/config.py)

from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Dict, Any, List

@dataclass
class DataPaths:
    """Configuration for data file paths."""
    # Relative defaults — presumably resolved against the working
    # directory unless overridden via configuration; TODO confirm.
    languages_db: Path = Path("data/languages.json")
    names_dir: Path = Path("data/names/")
    number_rules_dir: Path = Path("data/numbers/")
    cache_dir: Path = Path(".africanlanguages_cache/")

@dataclass
class APIConfig:
    """API configuration settings."""
    timeout: int = 30  # presumably seconds — confirm unit
    max_retries: int = 3
    rate_limit: int = 100  # presumably requests per window — confirm
    enable_caching: bool = True

@dataclass
class PackageConfig:
    """Main package configuration."""
    # NOTE(review): `field` is used below but this module's shown imports
    # bring in only `dataclass`; `ProcessingMode` (core.base) is also
    # referenced without an import. Both need importing.
    data_paths: DataPaths = field(default_factory=DataPaths)
    api: APIConfig = field(default_factory=APIConfig)
    default_mode: ProcessingMode = ProcessingMode.AUTO
    log_level: str = "INFO"  # standard logging level name
    custom_settings: Dict[str, Any] = field(default_factory=dict)

class ConfigManager(metaclass=Singleton):
    """Global configuration manager (one shared instance via Singleton).

    NOTE(review): `Singleton` is defined in core.base and is not
    imported in this module's shown imports.
    """

    def __init__(self):
        self.config = PackageConfig()  # start from package defaults
        self._load_user_config()       # then overlay user settings

    def _load_user_config(self) -> None:
        """Load user configuration from file or environment."""
        pass  # TODO: file format / env-var scheme not yet specified

    def get(self, key: str, default: Any = None) -> Any:
        """Get configuration value."""
        # TODO: key addressing (flat vs. dotted paths) not yet decided.
        pass

    def set(self, key: str, value: Any) -> None:
        """Set configuration value."""
        pass  # TODO: not yet implemented

2.3 Custom Exceptions (core/exceptions.py)

# Exception hierarchy: everything raised by the package derives from
# AfricanLanguagesError, so callers can catch the base class broadly
# or a specific subclass narrowly.
class AfricanLanguagesError(Exception):
    """Base exception for the package"""
    pass

class LanguageNotFoundError(AfricanLanguagesError):
    """Raised when a requested language is not found"""
    pass

class InvalidLanguageCodeError(AfricanLanguagesError):
    """Raised when language code format is invalid"""
    pass

class TextProcessingError(AfricanLanguagesError):
    """Raised when text processing fails"""
    pass

class DataLoadError(AfricanLanguagesError):
    """Raised when data loading fails"""
    pass

class ConfigurationError(AfricanLanguagesError):
    """Raised when configuration is invalid"""
    pass

3. Language Model Classes (languages/models.py)

from dataclasses import dataclass, field
from typing import Optional, List, Dict, Set
from enum import Enum

class ScriptType(Enum):
    """Writing scripts used by African languages."""
    LATIN = "Latin"
    ARABIC = "Arabic"
    ETHIOPIC = "Ethiopic"
    TIFINAGH = "Tifinagh"
    INDIGENOUS = "Indigenous"  # catch-all for scripts not listed individually
    MIXED = "Mixed"            # more than one script in common use

class VitalityStatus(Enum):
    """Language endangerment levels (mirrors the UNESCO vitality scale)."""
    SAFE = "safe"
    VULNERABLE = "vulnerable"
    DEFINITELY_ENDANGERED = "definitely_endangered"
    SEVERELY_ENDANGERED = "severely_endangered"
    CRITICALLY_ENDANGERED = "critically_endangered"
    EXTINCT = "extinct"

@dataclass
class GeographicInfo:
    """Geographic information for a language."""
    countries: List[str] = field(default_factory=list)  # country names (format TBD — confirm)
    regions: List[str] = field(default_factory=list)
    coordinates: Optional[Dict[str, float]] = None  # lat, lng
    urban_areas: List[str] = field(default_factory=list)

@dataclass
class LinguisticInfo:
    """Linguistic classification information."""
    family: Optional[str] = None
    subfamily: Optional[str] = None
    branch: Optional[str] = None
    genus: Optional[str] = None
    # Full classification lineage; ordering (broad->narrow?) TBD — confirm.
    classification_path: List[str] = field(default_factory=list)

@dataclass
class WritingSystem:
    """Writing system information."""
    script: ScriptType
    direction: str = "ltr"  # ltr, rtl, ttb
    unicode_ranges: List[str] = field(default_factory=list)
    orthographies: List[str] = field(default_factory=list)
    has_tone_marks: bool = False
    has_diacritics: bool = False

@dataclass
class DemographicInfo:
    """Demographic and vitality information."""
    speaker_count: Optional[int] = None  # total speakers (presumably L1 + L2 — confirm)
    l1_speakers: Optional[int] = None    # native speakers
    l2_speakers: Optional[int] = None    # second-language speakers
    vitality: Optional[VitalityStatus] = None
    year_data: Optional[int] = None      # year the figures were collected

@dataclass
class LanguageCodes:
    """Various language code systems."""
    iso639_1: Optional[str] = None    # two-letter code, e.g. "sw"
    iso639_2: Optional[str] = None    # three-letter code
    iso639_3: Optional[str] = None    # three-letter code covering all languages
    glottocode: Optional[str] = None  # Glottolog identifier, e.g. "stan1295"
    ethnologue: Optional[str] = None
    wikidata: Optional[str] = None
    alternatives: List[str] = field(default_factory=list)  # legacy/alternate codes

@dataclass
class Language:
    """Main language data model.

    Aggregates codes, geography, linguistic classification, writing
    systems and demographics for a single language.

    NOTE(review): `Any` is used in the `metadata` annotation but this
    module's shown typing import omits it; add `Any` to the import.

    Raises:
        ValueError: at construction, when `name` is empty or blank.
    """
    name: str
    codes: LanguageCodes
    geographic: GeographicInfo = field(default_factory=GeographicInfo)
    linguistic: LinguisticInfo = field(default_factory=LinguisticInfo)
    writing_systems: List[WritingSystem] = field(default_factory=list)
    demographic: DemographicInfo = field(default_factory=DemographicInfo)
    alternative_names: List[str] = field(default_factory=list)
    dialects: List[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Validate and normalize data after initialization."""
        self._validate()

    def _validate(self) -> None:
        """Validate language data consistency.

        Raises:
            ValueError: if the language has no non-blank name.
        """
        if not self.name or not self.name.strip():
            raise ValueError("Language requires a non-empty name")

    def get_primary_code(self) -> Optional[str]:
        """Return the preferred code for this language.

        Preference order: ISO 639-3, ISO 639-2, ISO 639-1, Glottocode.
        Returns None when no code is set.
        """
        c = self.codes
        for code in (c.iso639_3, c.iso639_2, c.iso639_1, c.glottocode):
            if code:
                return code
        return None

    def has_script(self, script: ScriptType) -> bool:
        """True if any of this language's writing systems uses *script*."""
        return any(ws.script == script for ws in self.writing_systems)

    def in_country(self, country: str) -> bool:
        """True if *country* is in the geographic country list.

        Comparison is case-insensitive (casefolded) but otherwise exact.
        """
        wanted = country.casefold()
        return any(c.casefold() == wanted for c in self.geographic.countries)

4. Language Registry and Discovery (languages/registry.py, languages/discovery.py)

# languages/registry.py
from typing import Dict, List, Optional, Iterator, Set
from .models import Language, ScriptType, VitalityStatus

class LanguageRegistry(metaclass=Singleton):
    """Central registry for all languages.

    NOTE(review): `Singleton` lives in core.base and does not appear in
    this module's shown imports.
    """

    def __init__(self):
        self._languages: Dict[str, Language] = {}  # code -> Language (key scheme TBD — confirm)
        self._indexes: Dict[str, Dict[str, Set[str]]] = {}  # index name -> key -> codes
        self._loaded = False  # guards against loading data twice

    def load_data(self, data_path: Optional[str] = None) -> None:
        """Load language data from file or default location."""
        pass  # TODO: not yet implemented

    def get_language(self, code: str) -> Optional[Language]:
        """Get language by any valid code."""
        pass  # TODO: not yet implemented

    def add_language(self, language: Language) -> None:
        """Add a new language to the registry."""
        pass  # TODO: should also update the indexes

    def _build_indexes(self) -> None:
        """Build search indexes for fast queries."""
        pass  # TODO: not yet implemented

    def get_all_languages(self) -> Iterator[Language]:
        """Get all languages as an iterator.

        NOTE(review): as written this returns None, but
        LanguageQuery._execute iterates the result — implement
        (e.g. yield from self._languages.values()) before wiring up.
        """
        pass

    def search(self, query: str) -> List[Language]:
        """Search languages by name or code."""
        pass  # TODO: not yet implemented

# languages/discovery.py
from typing import List, Optional, Callable, Dict, Any
from .models import Language, ScriptType, VitalityStatus
from .registry import LanguageRegistry

class LanguageQuery:
    """Fluent, lazily-evaluated query over the language registry.

    Filters accumulate until `.all()` / `.first()` / `.count()` runs
    them. Results are cached; the cache is invalidated whenever a new
    filter is added (the original implementation kept serving stale
    cached results if a filter was added after execution).
    """

    def __init__(self, registry: LanguageRegistry):
        self.registry = registry
        self._filters: List[Callable[[Language], bool]] = []
        self._results: Optional[List[Language]] = None

    def _add(self, predicate: Callable[[Language], bool]) -> 'LanguageQuery':
        """Append a filter predicate and drop any cached results."""
        self._filters.append(predicate)
        self._results = None  # new filter -> previous results are stale
        return self

    def by_country(self, country: str) -> 'LanguageQuery':
        """Filter by country (delegates to Language.in_country)."""
        return self._add(lambda lang: lang.in_country(country))

    def by_region(self, region: str) -> 'LanguageQuery':
        """Filter by region (exact membership in geographic.regions)."""
        return self._add(lambda lang: region in lang.geographic.regions)

    def by_family(self, family: str) -> 'LanguageQuery':
        """Filter by language family (exact match)."""
        return self._add(lambda lang: lang.linguistic.family == family)

    def by_script(self, script: ScriptType) -> 'LanguageQuery':
        """Filter by writing script."""
        return self._add(lambda lang: lang.has_script(script))

    def by_vitality(self, status: VitalityStatus) -> 'LanguageQuery':
        """Filter by vitality status."""
        return self._add(lambda lang: lang.demographic.vitality == status)

    def with_speakers_above(self, count: int) -> 'LanguageQuery':
        """Filter by minimum speaker count (unknown counts treated as 0)."""
        return self._add(
            lambda lang: (lang.demographic.speaker_count or 0) >= count)

    def all(self) -> List[Language]:
        """Execute the query; cached until another filter is added."""
        if self._results is None:
            self._results = self._execute()
        return self._results

    def first(self) -> Optional[Language]:
        """First matching language, or None when nothing matches."""
        results = self.all()
        return results[0] if results else None

    def count(self) -> int:
        """Number of matching languages."""
        return len(self.all())

    def _execute(self) -> List[Language]:
        """Apply every filter to every registered language."""
        return [
            language
            for language in self.registry.get_all_languages()
            if all(f(language) for f in self._filters)
        ]

class LanguageDiscovery:
    """Main interface for language discovery."""

    def __init__(self, registry: Optional[LanguageRegistry] = None):
        # LanguageRegistry uses the Singleton metaclass, so the default
        # constructs (or returns) the shared registry instance.
        self.registry = registry or LanguageRegistry()

    def query(self) -> LanguageQuery:
        """Start a new fluent query over the registry."""
        return LanguageQuery(self.registry)

    def get_statistics(self) -> Dict[str, Any]:
        """Get overall statistics about languages."""
        pass  # TODO: not yet implemented

    def random_language(self, **filters) -> Optional[Language]:
        """Get a random language matching filters."""
        pass  # TODO: filter-kwarg semantics not yet specified — confirm

5. Code Conversion System (languages/codes.py)

from typing import Dict, List, Optional, Set
from .models import Language, LanguageCodes
from .registry import LanguageRegistry

class CodeConverter:
    """Convert between different language code systems."""

    def __init__(self, registry: LanguageRegistry):
        self.registry = registry
        # system name -> {code -> target code}; exact shape TBD — confirm.
        self._code_mappings: Dict[str, Dict[str, str]] = {}
        self._build_mappings()  # built eagerly so convert() never lazy-loads

    def _build_mappings(self) -> None:
        """Build bidirectional code mapping tables."""
        pass  # TODO: not yet implemented

    def convert(self, code: str, from_system: str, to_system: str) -> Optional[str]:
        """Convert code from one system to another (None when unmapped)."""
        pass  # TODO: not yet implemented

    def normalize_code(self, code: str) -> Optional[str]:
        """Normalize code to preferred format."""
        pass  # TODO: not yet implemented

    def validate_code(self, code: str, system: str) -> bool:
        """Validate if code is valid in system."""
        pass  # TODO: presumably delegates to CodeValidator — confirm

    def get_all_codes(self, language: Language) -> Dict[str, Optional[str]]:
        """Get all available codes for a language."""
        pass  # TODO: not yet implemented

class CodeValidator:
    """Validate language code formats.

    Checks are purely syntactic: a code may be well-formed yet not
    assigned to any real language (registration is the registry's job).
    """

    @staticmethod
    def is_valid_iso639_1(code: str) -> bool:
        """True for exactly two ASCII lowercase letters (e.g. 'sw')."""
        return (
            isinstance(code, str)
            and len(code) == 2
            and code.isascii()
            and code.isalpha()
            and code.islower()
        )

    @staticmethod
    def is_valid_iso639_3(code: str) -> bool:
        """True for exactly three ASCII lowercase letters (e.g. 'yor')."""
        return (
            isinstance(code, str)
            and len(code) == 3
            and code.isascii()
            and code.isalpha()
            and code.islower()
        )

    @staticmethod
    def is_valid_glottocode(code: str) -> bool:
        """True for the Glottolog format: four lowercase ASCII
        letters/digits followed by four digits (e.g. 'stan1295')."""
        if not isinstance(code, str) or len(code) != 8:
            return False
        head, tail = code[:4], code[4:]
        return (
            head.isascii()
            and head.isalnum()
            and head == head.lower()
            and tail.isascii()
            and tail.isdigit()
        )

6. Text Processing Classes (text/processors.py)

from typing import Dict, Any, List, Optional
from ..core.base import BaseProcessor, ProcessingResult, ProcessingMode
from ..languages.models import Language

class TextNormalizer(BaseProcessor):
    """Normalize text according to language-specific rules.

    The language is optional: generic Unicode normalization works
    without any language-specific rule set.
    """

    def __init__(self, language: Optional[Language] = None, **config):
        # Assigned before super().__init__ so _setup() (invoked by the
        # base constructor) can see the language.
        self.language = language
        super().__init__(config)

    def _setup(self) -> None:
        """Initialize normalization rules."""
        pass  # TODO: rule source/format not yet specified

    def process(self, text: str, **kwargs) -> ProcessingResult:
        """Normalize input text."""
        pass  # TODO: not yet implemented

    def normalize_unicode(self, text: str, form: str = "NFC") -> str:
        """Return *text* in the requested Unicode normalization form.

        Args:
            text: input string.
            form: one of "NFC", "NFD", "NFKC", "NFKD".

        Raises:
            ValueError: if *form* is not a valid normalization form.
        """
        import unicodedata  # local: keeps this module stdlib-light
        return unicodedata.normalize(form, text)

    def fix_encoding(self, text: str) -> ProcessingResult:
        """Detect and fix encoding issues (mojibake)."""
        pass  # TODO: not yet implemented

class DiacriticRestorer(BaseProcessor):
    """Restore missing diacritics in text.

    Unlike TextNormalizer, the language is required: restoration is
    inherently language-specific.
    """

    def __init__(self, language: Language, **config):
        # Assigned before super().__init__ so _setup() (invoked by the
        # base constructor) can load models for this language.
        self.language = language
        super().__init__(config)

    def _setup(self) -> None:
        """Load diacritic restoration models."""
        pass  # TODO: model source/format not yet specified

    def process(self, text: str, **kwargs) -> ProcessingResult:
        """Restore diacritics in text."""
        pass  # TODO: not yet implemented

@dataclass
class TokenizationResult:
    """Result of a tokenization operation.

    NOTE(review): `dataclass`/`field` do not appear in this module's
    shown imports; add `from dataclasses import dataclass, field`.
    """
    tokens: List[str]
    spans: List[tuple]  # (start, end) positions
    metadata: Dict[str, Any] = field(default_factory=dict)

class TokenizerComparator:
    """Compare different tokenizers on African language text."""

    def __init__(self):
        self.tokenizers = {}  # name -> tokenizer (type TBD — confirm)
        self._register_tokenizers()  # populate the registry up front

    def _register_tokenizers(self) -> None:
        """Register available tokenizers."""
        pass  # TODO: which tokenizers to support is not yet decided

    def compare(self, text: str, language: Optional[Language] = None) -> Dict[str, TokenizationResult]:
        """Compare tokenization results across different tokenizers."""
        pass  # TODO: result keyed by tokenizer name, per the annotation

    def evaluate(self, text: str, ground_truth: List[str], language: Optional[Language] = None) -> Dict[str, float]:
        """Evaluate tokenizers against ground truth."""
        pass  # TODO: metric (accuracy? F1?) not yet decided — confirm

7. Name Generation (names/generators.py)

from typing import List, Optional, Dict, Any
from ..languages.models import Language
from ..core.base import BaseProcessor, ProcessingResult

@dataclass
class NameGenerationConfig:
    """Configuration for name generation.

    NOTE(review): `dataclass` does not appear in this module's shown
    imports; add `from dataclasses import dataclass`.
    """
    gender: Optional[str] = None  # male, female, neutral
    count: int = 1  # number of names to produce
    include_meaning: bool = False  # attach meanings where the dataset has them
    cultural_context: Optional[str] = None  # semantics TBD — confirm

class NameGenerator(BaseProcessor):
    """Generate authentic names from African languages."""

    def __init__(self, language: Language, **config):
        # Assigned before super().__init__ so _setup() (invoked by the
        # base constructor) can load data for this language.
        self.language = language
        super().__init__(config)

    def _setup(self) -> None:
        """Load name datasets for the language."""
        pass  # TODO: presumably reads from DataPaths.names_dir — confirm

    def process(self, config: NameGenerationConfig, **kwargs) -> ProcessingResult:
        """Generate names based on configuration."""
        pass  # TODO: not yet implemented

    def generate_single(self, gender: Optional[str] = None) -> str:
        """Generate a single name."""
        pass  # TODO: not yet implemented

    def generate_batch(self, count: int, **kwargs) -> List[str]:
        """Generate multiple names."""
        pass  # TODO: not yet implemented

    def get_name_meaning(self, name: str) -> Optional[str]:
        """Get meaning of a name if available (None when unknown)."""
        pass  # TODO: not yet implemented

class MultiLanguageNameGenerator:
    """Generate names from multiple African languages."""

    def __init__(self):
        # Presumably keyed by language code — confirm key scheme.
        self.generators: Dict[str, NameGenerator] = {}

    def add_language(self, language: Language) -> None:
        """Add a language for name generation."""
        pass  # TODO: should construct and register a NameGenerator

    def generate_from_region(self, region: str, **config) -> List[str]:
        """Generate names from a specific region."""
        pass  # TODO: not yet implemented

    def generate_mixed(self, languages: List[Language], **config) -> List[str]:
        """Generate names from multiple languages."""
        pass  # TODO: not yet implemented

8. Number Conversion (numbers/converters.py)

from typing import Dict, Any, Optional
from ..languages.models import Language
from ..core.base import BaseProcessor, ProcessingResult

@dataclass
class NumberConversionRule:
    """Rule for converting numbers to words.

    NOTE(review): `dataclass`/`field` do not appear in this module's
    shown imports; add `from dataclasses import dataclass, field`.
    """
    pattern: str  # which numbers the rule matches (format TBD — confirm)
    template: str  # output template applied to matched numbers
    conditions: Dict[str, Any] = field(default_factory=dict)  # extra applicability constraints

class NumberToWordsConverter(BaseProcessor):
    """Convert numbers to words in African languages.

    NOTE(review): `List` is used in __init__ but this module's shown
    typing import brings in only Dict, Any and Optional.
    """

    def __init__(self, language: Language, **config):
        # Assigned before super().__init__ so _setup() (invoked by the
        # base constructor) can populate self.rules.
        self.language = language
        self.rules: List[NumberConversionRule] = []
        super().__init__(config)

    def _setup(self) -> None:
        """Load conversion rules for the language."""
        pass  # TODO: presumably reads DataPaths.number_rules_dir — confirm

    def process(self, number: int, **kwargs) -> ProcessingResult:
        """Convert number to words."""
        pass  # TODO: not yet implemented

    def convert(self, number: int) -> str:
        """Convert number to words (simple interface)."""
        pass  # TODO: not yet implemented

    def add_rule(self, rule: NumberConversionRule) -> None:
        """Add a conversion rule."""
        pass  # TODO: rule precedence/ordering not yet decided

    def _apply_rules(self, number: int) -> str:
        """Apply conversion rules to number."""
        pass  # TODO: not yet implemented

9. Utility Classes (utils/cache.py, utils/helpers.py)

# utils/cache.py
from typing import Any, Optional, Dict
import time
from functools import wraps

class LRUCache:
    """Least-recently-used cache with a fixed maximum size.

    Recency is tracked with a monotonically increasing tick stored in
    `access_times` (a counter rather than wall-clock time, so rapid
    accesses never tie). When full, the entry with the smallest tick is
    evicted. Eviction is O(n) in the number of entries, which is fine
    for the small default size. Not thread-safe. A stored value of
    None is indistinguishable from a miss via get().
    """

    def __init__(self, max_size: int = 128):
        if max_size <= 0:
            raise ValueError("max_size must be positive")
        self.max_size = max_size
        self.cache: Dict[str, Any] = {}
        self.access_times: Dict[str, float] = {}
        self._tick = 0.0  # monotone counter; avoids clock-resolution ties

    def _touch(self, key: str) -> None:
        """Mark *key* as most recently used."""
        self._tick += 1.0
        self.access_times[key] = self._tick

    def get(self, key: str) -> Optional[Any]:
        """Return the cached value (refreshing recency), or None on a miss."""
        if key not in self.cache:
            return None
        self._touch(key)
        return self.cache[key]

    def set(self, key: str, value: Any) -> None:
        """Insert or replace *key*, evicting the LRU entry when full."""
        if key not in self.cache and len(self.cache) >= self.max_size:
            lru_key = min(self.access_times, key=self.access_times.get)
            del self.cache[lru_key]
            del self.access_times[lru_key]
        self.cache[key] = value
        self._touch(key)

    def clear(self) -> None:
        """Remove all cache entries."""
        self.cache.clear()
        self.access_times.clear()

def cached(max_size: int = 128, ttl: Optional[int] = None):
    """Decorator that caches a function's results in an LRUCache.

    Args:
        max_size: maximum number of cached results per function.
        ttl: time-to-live in seconds; None means entries never expire.

    Cache keys are built from repr() of the positional args and sorted
    keyword args, so arguments must have stable, value-based reprs
    (hashability is not required, but object-identity reprs will defeat
    caching).
    """
    def decorator(func):
        cache = LRUCache(max_size)

        @wraps(func)
        def wrapper(*args, **kwargs):
            key = repr(args) + "|" + repr(sorted(kwargs.items()))
            entry = cache.get(key)  # entry is (value, stored_at) or None
            if entry is not None:
                value, stored_at = entry
                if ttl is None or time.time() - stored_at < ttl:
                    return value
            result = func(*args, **kwargs)
            cache.set(key, (result, time.time()))
            return result
        return wrapper
    return decorator

# utils/helpers.py
from typing import List, Dict, Any, Optional
import re

def normalize_string(s: str) -> str:
    """Normalize a string for loose comparison.

    Applies Unicode NFKC normalization (folding compatibility forms
    such as fullwidth letters), casefolds for aggressive
    case-insensitivity, and collapses any run of whitespace to a single
    space while trimming the ends.

    Args:
        s: input string.

    Returns:
        The normalized string ("" for empty/whitespace-only input).
    """
    import unicodedata  # local: helper module keeps no unicodedata dep
    normalized = unicodedata.normalize("NFKC", s)
    return " ".join(normalized.casefold().split())

def fuzzy_match(s1: str, s2: str, threshold: float = 0.8) -> bool:
    """Check whether two strings are at least *threshold* similar.

    Similarity is difflib.SequenceMatcher.ratio() over the casefolded
    strings, so matching is case-insensitive. A threshold of 0.0 always
    matches; 1.0 requires case-insensitive equality.

    Args:
        s1: first string.
        s2: second string.
        threshold: minimum similarity ratio in [0.0, 1.0].
    """
    from difflib import SequenceMatcher  # local: stdlib-only helper
    ratio = SequenceMatcher(None, s1.casefold(), s2.casefold()).ratio()
    return ratio >= threshold

def extract_language_codes(text: str) -> List[str]:
    """Extract language codes from text.

    TODO: the extraction rule is unspecified — bare 2/3-letter tokens
    are ambiguous with ordinary words, so decide whether this matches
    only tagged forms (e.g. "iso:yor", BCP-47 tags) before implementing.
    """
    pass

def validate_unicode_text(text: str, allowed_scripts: Optional[List[str]] = None) -> bool:
    """Check that every letter in *text* belongs to an allowed script.

    Script membership is approximated by the prefix of each character's
    Unicode name (e.g. "LATIN SMALL LETTER A" -> "Latin"). Non-letter
    characters (digits, punctuation, whitespace, combining marks) are
    ignored.

    Args:
        text: string to validate.
        allowed_scripts: script names such as ["Latin", "Arabic",
            "Ethiopic"]; None means no restriction (always valid).

    Returns:
        True when all letters belong to allowed scripts.
    """
    if allowed_scripts is None:
        return True
    import unicodedata  # local: stdlib-only helper
    prefixes = tuple(script.upper() for script in allowed_scripts)
    for char in text:
        if not char.isalpha():
            continue  # only letters carry script restrictions
        try:
            name = unicodedata.name(char)
        except ValueError:  # unnamed/unassigned code point
            return False
        if not name.startswith(prefixes):
            return False
    return True

class DataValidator:
    """Validate raw language-data dictionaries before model construction."""

    # Fields every raw language record must carry.
    REQUIRED_LANGUAGE_FIELDS = ["name", "codes"]

    @staticmethod
    def validate_language_data(data: Dict[str, Any]) -> List[str]:
        """Return a list of problems with *data* (empty list == valid).

        Checks required fields are present, that 'name' is a non-empty
        string and that 'codes' is a mapping.
        """
        errors = DataValidator.check_required_fields(
            data, DataValidator.REQUIRED_LANGUAGE_FIELDS)
        if "name" in data:
            name = data["name"]
            if not isinstance(name, str) or not name.strip():
                errors.append("'name' must be a non-empty string")
        if "codes" in data and not isinstance(data["codes"], dict):
            errors.append("'codes' must be a mapping of code-system to code")
        return errors

    @staticmethod
    def check_required_fields(data: Dict[str, Any], required: List[str]) -> List[str]:
        """Return one error message per field in *required* missing from *data*."""
        return [
            f"missing required field: {field_name!r}"
            for field_name in required
            if field_name not in data
        ]

10. Main Package Interface (__init__.py)

"""
africanlanguages - A comprehensive Python package for African language processing
"""

from .languages import Language, LanguageDiscovery, CodeConverter
from .text import TextNormalizer, DiacriticRestorer, TokenizerComparator
from .names import NameGenerator, MultiLanguageNameGenerator  
from .numbers import NumberToWordsConverter
from .core import PackageConfig, ConfigManager

# Version info
__version__ = "0.1.0"
__author__ = "Your Name"
__email__ = "[email protected]"

# Main interfaces
languages = LanguageDiscovery()
config = ConfigManager()

# Convenience functions
def get_language(code: str) -> Optional[Language]:
    """Get a language by code.

    NOTE(review): `Optional`/`List` are used in these module-level
    signatures but `typing` is never imported in this module; this also
    relies on LanguageDiscovery exposing `registry` publicly.
    """
    return languages.registry.get_language(code)

def search_languages(query: str) -> List[Language]:
    """Search languages by name or code."""
    return languages.registry.search(query)

def list_countries() -> List[str]:
    """Get list of all countries with African languages."""
    pass  # TODO: not yet implemented

def list_families() -> List[str]:
    """Get list of all language families."""
    pass  # TODO: not yet implemented

# Package initialization
def _initialize_package():
    """Initialize package resources."""
    pass  # TODO: intended to load registry data / warm caches — confirm

_initialize_package()  # runs at import time; keep side effects minimal

This initial code design provides a solid foundation with:

  • Clear class hierarchies with abstract base classes
  • Comprehensive data models using dataclasses
  • Fluent query interfaces for ease of use
  • Extensible processor architecture for text processing
  • Singleton patterns for registries and configuration
  • Type hints throughout for better IDE support
  • Error handling with custom exceptions
  • Caching mechanisms for performance
  • Validation utilities for data integrity

The design emphasizes modularity, extensibility, and ease of use while providing a robust foundation for African language processing tasks.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment