africanlanguages/
├── __init__.py
├── core/
│ ├── __init__.py
│ ├── base.py
│ ├── exceptions.py
│ └── config.py
├── languages/
│ ├── __init__.py
│ ├── models.py
│ ├── registry.py
│ ├── discovery.py
│ └── codes.py
├── text/
│ ├── __init__.py
│ ├── processors.py
│ ├── encoding.py
│ ├── normalization.py
│ └── tokenizers.py
├── names/
│ ├── __init__.py
│ ├── generators.py
│ └── sources.py
├── numbers/
│ ├── __init__.py
│ ├── converters.py
│ └── rules.py
├── evaluation/
│ ├── __init__.py
│ ├── benchmarks.py
│ └── metrics.py
├── data/
│ ├── __init__.py
│ ├── loaders.py
│ └── resources/
└── utils/
    ├── __init__.py
    ├── cache.py
    └── helpers.py
from abc import ABC, abstractmethod
from typing import Dict, Any, Optional, List
from dataclasses import dataclass, field
from enum import Enum
class ProcessingMode(Enum):
    """How strictly processors treat irregular input.

    NOTE(review): member semantics are not defined in this module —
    presumably STRICT fails fast, LENIENT tolerates and warns, and AUTO
    lets each processor choose; confirm against processor implementations.
    """
    STRICT = "strict"
    LENIENT = "lenient"
    AUTO = "auto"
@dataclass
class ProcessingResult:
    """Base result class for processing operations.

    Carries the outcome payload plus any errors/warnings accumulated
    during processing; ``success`` is the authoritative pass/fail flag.
    """
    success: bool  # True when the operation completed successfully
    data: Any = None  # operation payload; type depends on the concrete processor
    errors: List[str] = field(default_factory=list)  # fatal problems encountered
    warnings: List[str] = field(default_factory=list)  # non-fatal notes
    metadata: Dict[str, Any] = field(default_factory=dict)  # processor-specific extras
class BaseProcessor(ABC):
    """Abstract base class for all processors.

    Concrete subclasses implement ``_setup`` (one-time initialization,
    invoked from the constructor) and ``process`` (the actual work).
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the configuration mapping and run subclass setup.

        Args:
            config: optional settings mapping; an empty dict is used when
                omitted or falsy.
        """
        self.config = config if config else {}
        self._setup()

    @abstractmethod
    def _setup(self) -> None:
        """Initialize processor-specific state (called once from __init__)."""

    @abstractmethod
    def process(self, data: Any, **kwargs) -> ProcessingResult:
        """Process *data* and return a ProcessingResult describing the outcome."""
class Singleton(type):
    """Metaclass implementing the singleton pattern.

    The first instantiation of a class using this metaclass is cached in
    ``_instances``; every subsequent call returns that cached object.
    """

    _instances = {}  # maps class -> its single instance

    def __call__(cls, *args, **kwargs):
        # EAFP: the common case after warm-up is a cache hit.
        try:
            return cls._instances[cls]
        except KeyError:
            instance = super().__call__(*args, **kwargs)
            cls._instances[cls] = instance
            return instance
from pathlib import Path
from typing import Optional, Dict, Any, List
@dataclass
class DataPaths:
    """Configuration for data file paths.

    Defaults are relative paths — presumably resolved against the package
    root or working directory; confirm with the data loaders.
    """
    languages_db: Path = Path("data/languages.json")  # language registry source file
    names_dir: Path = Path("data/names/")  # per-language name datasets
    number_rules_dir: Path = Path("data/numbers/")  # number-to-words rule files
    cache_dir: Path = Path(".africanlanguages_cache/")  # on-disk cache location
@dataclass
class APIConfig:
    """API configuration settings."""
    timeout: int = 30  # request timeout; presumably seconds — confirm
    max_retries: int = 3  # retry attempts after a failed call
    rate_limit: int = 100  # presumably requests per time window — confirm units
    enable_caching: bool = True  # whether API responses are cached
@dataclass
class PackageConfig:
    """Main package configuration, aggregating path, API and mode settings."""
    # NOTE(review): this config module's own import line brings in only
    # `dataclass`; `field` and `ProcessingMode` must also be in scope here —
    # confirm the module's imports.
    data_paths: DataPaths = field(default_factory=DataPaths)  # file-system locations
    api: APIConfig = field(default_factory=APIConfig)  # remote API settings
    default_mode: ProcessingMode = ProcessingMode.AUTO  # fallback processing mode
    log_level: str = "INFO"  # standard logging level name
    custom_settings: Dict[str, Any] = field(default_factory=dict)  # free-form user overrides
class ConfigManager(metaclass=Singleton):
    """Global configuration manager (singleton).

    Wraps a PackageConfig instance and exposes dotted-path ``get``/``set``
    access, e.g. ``config.get("api.timeout")``.
    """

    def __init__(self):
        self.config = PackageConfig()
        self._load_user_config()

    def _load_user_config(self) -> None:
        """Apply user overrides from the environment.

        Currently only ``AFRICANLANGUAGES_LOG_LEVEL`` is honoured;
        file-based configuration can be layered in here later.
        """
        import os  # local import: avoids touching module-level imports

        level = os.environ.get("AFRICANLANGUAGES_LOG_LEVEL")
        if level:
            self.config.log_level = level

    def get(self, key: str, default: Any = None) -> Any:
        """Return the value at dotted path *key*.

        Attribute segments are resolved against PackageConfig first
        (``"api.timeout"``); unresolved keys fall back to the
        ``custom_settings`` dict and finally to *default*.
        """
        target: Any = self.config
        for part in key.split("."):
            if isinstance(target, dict):
                if part not in target:
                    return self.config.custom_settings.get(key, default)
                target = target[part]
            elif hasattr(target, part):
                target = getattr(target, part)
            else:
                return self.config.custom_settings.get(key, default)
        return target

    def set(self, key: str, value: Any) -> None:
        """Set *key* to *value*.

        Existing top-level PackageConfig attributes are assigned directly;
        anything else (including dotted keys) is stored in ``custom_settings``.
        """
        if "." not in key and hasattr(self.config, key):
            setattr(self.config, key, value)
        else:
            self.config.custom_settings[key] = value
class AfricanLanguagesError(Exception):
    """Base exception for the package; all package errors derive from it."""


class LanguageNotFoundError(AfricanLanguagesError):
    """Raised when a requested language is not found."""


class InvalidLanguageCodeError(AfricanLanguagesError):
    """Raised when a language code's format is invalid."""


class TextProcessingError(AfricanLanguagesError):
    """Raised when text processing fails."""


class DataLoadError(AfricanLanguagesError):
    """Raised when data loading fails."""


class ConfigurationError(AfricanLanguagesError):
    """Raised when configuration is invalid."""
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Set
class ScriptType(Enum):
    """Writing scripts used for African languages."""
    LATIN = "Latin"
    ARABIC = "Arabic"
    ETHIOPIC = "Ethiopic"
    TIFINAGH = "Tifinagh"
    INDIGENOUS = "Indigenous"  # other locally developed scripts
    MIXED = "Mixed"  # more than one script in concurrent use
class VitalityStatus(Enum):
    """Language endangerment level.

    Category names match the UNESCO language-vitality scale, ordered from
    least to most endangered.
    """
    SAFE = "safe"
    VULNERABLE = "vulnerable"
    DEFINITELY_ENDANGERED = "definitely_endangered"
    SEVERELY_ENDANGERED = "severely_endangered"
    CRITICALLY_ENDANGERED = "critically_endangered"
    EXTINCT = "extinct"
@dataclass
class GeographicInfo:
    """Geographic information for a language."""
    countries: List[str] = field(default_factory=list)  # countries where the language is spoken
    regions: List[str] = field(default_factory=list)  # sub-national or cross-border regions
    coordinates: Optional[Dict[str, float]] = None  # lat, lng (per original note)
    urban_areas: List[str] = field(default_factory=list)  # cities with speaker communities
@dataclass
class LinguisticInfo:
    """Linguistic classification information, broadest to narrowest."""
    family: Optional[str] = None  # top-level family, e.g. a phylum name
    subfamily: Optional[str] = None
    branch: Optional[str] = None
    genus: Optional[str] = None
    classification_path: List[str] = field(default_factory=list)  # full ancestry chain
@dataclass
class WritingSystem:
    """Writing system information for one script a language is written in."""
    script: ScriptType  # which script family
    direction: str = "ltr"  # ltr, rtl, ttb (per original note)
    unicode_ranges: List[str] = field(default_factory=list)  # presumably code-point ranges — confirm format
    orthographies: List[str] = field(default_factory=list)  # named spelling conventions
    has_tone_marks: bool = False  # orthography marks tone
    has_diacritics: bool = False  # orthography uses diacritics
@dataclass
class DemographicInfo:
    """Demographic and vitality information; None means unknown."""
    speaker_count: Optional[int] = None  # total speakers — presumably L1 + L2, confirm
    l1_speakers: Optional[int] = None  # first-language speakers
    l2_speakers: Optional[int] = None  # second-language speakers
    vitality: Optional[VitalityStatus] = None  # endangerment category
    year_data: Optional[int] = None  # year the figures were collected
@dataclass
class LanguageCodes:
    """Identifiers for one language across the common code systems."""
    iso639_1: Optional[str] = None  # two-letter ISO 639-1 code
    iso639_2: Optional[str] = None  # three-letter ISO 639-2 code
    iso639_3: Optional[str] = None  # three-letter ISO 639-3 code
    glottocode: Optional[str] = None  # Glottolog identifier
    ethnologue: Optional[str] = None  # Ethnologue identifier
    wikidata: Optional[str] = None  # Wikidata entity id
    alternatives: List[str] = field(default_factory=list)  # legacy/alternate codes
@dataclass
class Language:
    """Main language data model.

    Aggregates codes, geography, linguistic classification, writing
    systems and demographics for a single language.
    """

    name: str
    codes: LanguageCodes
    geographic: GeographicInfo = field(default_factory=GeographicInfo)
    linguistic: LinguisticInfo = field(default_factory=LinguisticInfo)
    writing_systems: List[WritingSystem] = field(default_factory=list)
    demographic: DemographicInfo = field(default_factory=DemographicInfo)
    alternative_names: List[str] = field(default_factory=list)
    dialects: List[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Validate and normalize data after initialization."""
        self._validate()

    def _validate(self) -> None:
        """Validate language data consistency.

        Raises:
            ValueError: if the language name is empty or blank.
        """
        if not self.name or not self.name.strip():
            raise ValueError("Language name must be a non-empty string")

    def get_primary_code(self) -> Optional[str]:
        """Return the preferred code: ISO 639-3, then 639-1, 639-2, Glottocode.

        Returns None when no code is recorded at all.
        """
        return (
            self.codes.iso639_3
            or self.codes.iso639_1
            or self.codes.iso639_2
            or self.codes.glottocode
        )

    def has_script(self, script: ScriptType) -> bool:
        """Check whether any registered writing system uses *script*."""
        return any(ws.script is script for ws in self.writing_systems)

    def in_country(self, country: str) -> bool:
        """Case-insensitive membership test against ``geographic.countries``."""
        wanted = country.casefold()
        return any(c.casefold() == wanted for c in self.geographic.countries)
# languages/registry.py
from typing import Dict, List, Optional, Iterator, Set
from .models import Language, ScriptType, VitalityStatus
class LanguageRegistry(metaclass=Singleton):
    """Central registry for all languages.

    Languages are stored under their primary code (falling back to the
    name); a secondary index maps every known code to primary keys.

    NOTE(review): ``Singleton`` is not imported by this module's own
    header — it must come from ``..core.base``; confirm the import exists.
    """

    def __init__(self):
        self._languages: Dict[str, Language] = {}  # primary key -> Language
        # index name -> value -> set of primary keys
        self._indexes: Dict[str, Dict[str, Set[str]]] = {}
        self._loaded = False

    def load_data(self, data_path: Optional[str] = None) -> None:
        """Load language data from *data_path* or the default location.

        Data-format dependent; left as a hook. Implementations should call
        ``add_language`` per record and set ``self._loaded = True``.
        """
        pass

    def get_language(self, code: str) -> Optional[Language]:
        """Look up a language by primary key or any indexed code; None if unknown."""
        if code in self._languages:
            return self._languages[code]
        keys = self._indexes.get("codes", {}).get(code)
        if keys:
            # any key in the set resolves the same code; take one deterministically
            return self._languages.get(next(iter(keys)))
        return None

    def add_language(self, language: Language) -> None:
        """Register *language* under its primary code (or name) and index it."""
        key = language.get_primary_code() or language.name
        self._languages[key] = language
        self._index_one(key, language)

    def _index_one(self, key: str, language: Language) -> None:
        """Add all of *language*'s codes to the 'codes' index under *key*."""
        codes_index = self._indexes.setdefault("codes", {})
        codes = language.codes
        for code in (codes.iso639_1, codes.iso639_2, codes.iso639_3,
                     codes.glottocode, *codes.alternatives):
            if code:
                codes_index.setdefault(code, set()).add(key)

    def _build_indexes(self) -> None:
        """Rebuild every search index from the registered languages."""
        self._indexes = {}
        for key, language in self._languages.items():
            self._index_one(key, language)

    def get_all_languages(self) -> Iterator[Language]:
        """Iterate over every registered language."""
        return iter(self._languages.values())

    def search(self, query: str) -> List[Language]:
        """Case-insensitive search by name substring or exact code match."""
        q = query.casefold()
        hits: List[Language] = []
        for lang in self._languages.values():
            names = (lang.name, *lang.alternative_names)
            codes = (lang.codes.iso639_1, lang.codes.iso639_2,
                     lang.codes.iso639_3, lang.codes.glottocode)
            if any(q in n.casefold() for n in names):
                hits.append(lang)
            elif any(c and c.casefold() == q for c in codes):
                hits.append(lang)
        return hits
# languages/discovery.py
from typing import List, Optional, Callable, Dict, Any
from .models import Language, ScriptType, VitalityStatus
from .registry import LanguageRegistry
class LanguageQuery:
    """Fluent, lazily evaluated query over the language registry.

    Each ``by_*`` call appends a predicate; ``all()`` runs the query once
    and caches the result list.
    """

    def __init__(self, registry: LanguageRegistry):
        self.registry = registry
        self._filters: List[Callable[[Language], bool]] = []
        self._results: Optional[List[Language]] = None

    def _add(self, predicate) -> 'LanguageQuery':
        """Append *predicate* and return self for chaining."""
        self._filters.append(predicate)
        return self

    def by_country(self, country: str) -> 'LanguageQuery':
        """Keep only languages spoken in *country*."""
        return self._add(lambda lang: lang.in_country(country))

    def by_region(self, region: str) -> 'LanguageQuery':
        """Keep only languages whose geographic regions include *region*."""
        return self._add(lambda lang: region in lang.geographic.regions)

    def by_family(self, family: str) -> 'LanguageQuery':
        """Keep only languages in the given language family."""
        return self._add(lambda lang: lang.linguistic.family == family)

    def by_script(self, script: ScriptType) -> 'LanguageQuery':
        """Keep only languages written in *script*."""
        return self._add(lambda lang: lang.has_script(script))

    def by_vitality(self, status: VitalityStatus) -> 'LanguageQuery':
        """Keep only languages with the given vitality status."""
        return self._add(lambda lang: lang.demographic.vitality == status)

    def with_speakers_above(self, count: int) -> 'LanguageQuery':
        """Keep languages with at least *count* speakers (unknown counts as 0)."""
        return self._add(
            lambda lang: (lang.demographic.speaker_count or 0) >= count
        )

    def all(self) -> List[Language]:
        """Execute the query (once) and return every match."""
        if self._results is None:
            self._results = self._execute()
        return self._results

    def first(self) -> Optional[Language]:
        """Return the first match, or None when nothing matches."""
        matches = self.all()
        return matches[0] if matches else None

    def count(self) -> int:
        """Return the number of matches."""
        return len(self.all())

    def _execute(self) -> List[Language]:
        """Apply every accumulated filter to every registered language."""
        return [
            lang
            for lang in self.registry.get_all_languages()
            if all(pred(lang) for pred in self._filters)
        ]
class LanguageDiscovery:
    """Main interface for language discovery."""

    def __init__(self, registry: Optional[LanguageRegistry] = None):
        # Falls back to the shared (singleton) registry when none is given.
        self.registry = registry or LanguageRegistry()

    def query(self) -> LanguageQuery:
        """Start a new fluent query against the registry."""
        return LanguageQuery(self.registry)

    def get_statistics(self) -> Dict[str, Any]:
        """Return aggregate counts over the registry.

        Returns a dict with ``total_languages``, ``total_families`` and
        ``total_countries``.
        """
        langs = list(self.registry.get_all_languages())
        families = {l.linguistic.family for l in langs if l.linguistic.family}
        countries: set = set()
        for lang in langs:
            countries.update(lang.geographic.countries)
        return {
            "total_languages": len(langs),
            "total_families": len(families),
            "total_countries": len(countries),
        }

    def random_language(self, **filters) -> Optional[Language]:
        """Return a random language matching *filters*, or None.

        Filter names map onto LanguageQuery methods by prefixing ``by_``:
        ``country="Kenya"`` calls ``by_country("Kenya")``. Unknown filter
        names raise AttributeError.
        """
        import random  # local import: keeps module headers untouched

        q = self.query()
        for name, value in filters.items():
            q = getattr(q, f"by_{name}")(value)
        matches = q.all()
        return random.choice(matches) if matches else None
from .models import Language, LanguageCodes
from .registry import LanguageRegistry
class CodeConverter:
    """Convert between different language code systems.

    Builds, for every (from_system, to_system) pair, a lookup table keyed
    ``"from:to"`` that maps a source code to its target-system code.
    """

    #: handled code systems; names match LanguageCodes attribute names
    SYSTEMS = ("iso639_1", "iso639_2", "iso639_3",
               "glottocode", "ethnologue", "wikidata")

    def __init__(self, registry: LanguageRegistry):
        self.registry = registry
        self._code_mappings: Dict[str, Dict[str, str]] = {}
        self._build_mappings()

    def _build_mappings(self) -> None:
        """Build bidirectional code mapping tables from registry contents."""
        for language in self.registry.get_all_languages():
            codes = {s: getattr(language.codes, s, None) for s in self.SYSTEMS}
            for src, src_code in codes.items():
                if not src_code:
                    continue
                for dst, dst_code in codes.items():
                    if dst_code:
                        table = self._code_mappings.setdefault(f"{src}:{dst}", {})
                        table[src_code] = dst_code

    def convert(self, code: str, from_system: str, to_system: str) -> Optional[str]:
        """Convert *code* between systems; None when the mapping is unknown."""
        return self._code_mappings.get(f"{from_system}:{to_system}", {}).get(code)

    def normalize_code(self, code: str) -> Optional[str]:
        """Normalize *code* to the preferred format (ISO 639-3 when known)."""
        cleaned = code.strip().lower()
        for src in self.SYSTEMS:
            hit = self._code_mappings.get(f"{src}:iso639_3", {}).get(cleaned)
            if hit:
                return hit
        return None

    def validate_code(self, code: str, system: str) -> bool:
        """True when *code* is registered under *system*."""
        return code in self._code_mappings.get(f"{system}:{system}", {})

    def get_all_codes(self, language: Language) -> Dict[str, Optional[str]]:
        """Return every known code for *language*, keyed by system name."""
        return {s: getattr(language.codes, s, None) for s in self.SYSTEMS}
class CodeValidator:
    """Validate language code formats (format only — not registry membership)."""

    @staticmethod
    def is_valid_iso639_1(code: str) -> bool:
        """True for exactly two lowercase ASCII letters, e.g. 'sw'."""
        return (
            len(code) == 2
            and code.isascii()
            and code.isalpha()
            and code.islower()
        )

    @staticmethod
    def is_valid_iso639_3(code: str) -> bool:
        """True for exactly three lowercase ASCII letters, e.g. 'yor'."""
        return (
            len(code) == 3
            and code.isascii()
            and code.isalpha()
            and code.islower()
        )

    @staticmethod
    def is_valid_glottocode(code: str) -> bool:
        """True for the Glottolog shape: four lowercase alphanumerics + four digits."""
        import re  # local import: this section's headers don't import re

        return re.fullmatch(r"[a-z0-9]{4}[0-9]{4}", code) is not None
from ..core.base import BaseProcessor, ProcessingResult, ProcessingMode
from ..languages.models import Language
class TextNormalizer(BaseProcessor):
    """Normalize text according to language-specific rules."""

    def __init__(self, language: Optional[Language] = None, **config):
        # Bind language before BaseProcessor.__init__ triggers _setup().
        self.language = language
        super().__init__(config)

    def _setup(self) -> None:
        """Initialize normalization settings from config."""
        # Unicode normalization form used by process(); NFC is the usual default.
        self.default_form = self.config.get("unicode_form", "NFC")

    def process(self, text: str, **kwargs) -> ProcessingResult:
        """Normalize *text* to the configured Unicode form.

        Accepts an optional ``form=`` keyword to override the default.
        """
        try:
            normalized = self.normalize_unicode(
                text, kwargs.get("form", self.default_form)
            )
        except ValueError as exc:  # invalid normalization form name
            return ProcessingResult(success=False, errors=[str(exc)])
        return ProcessingResult(
            success=True,
            data=normalized,
            metadata={"changed": normalized != text},
        )

    def normalize_unicode(self, text: str, form: str = "NFC") -> str:
        """Return *text* in the given Unicode normalization form (NFC/NFD/NFKC/NFKD)."""
        import unicodedata  # local import: module headers stay untouched

        return unicodedata.normalize(form, text)

    def fix_encoding(self, text: str) -> ProcessingResult:
        """Best-effort repair of latin-1/UTF-8 mojibake.

        If re-encoding as latin-1 and decoding as UTF-8 succeeds
        (e.g. 'Ã©' -> 'é'), the repaired text is returned; otherwise the
        input passes through unchanged with a warning.
        """
        try:
            repaired = text.encode("latin-1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            return ProcessingResult(
                success=True,
                data=text,
                warnings=["no mojibake repair applied"],
            )
        return ProcessingResult(
            success=True,
            data=repaired,
            metadata={"changed": repaired != text},
        )
class DiacriticRestorer(BaseProcessor):
    """Restore missing diacritics in text"""

    def __init__(self, language: Language, **config):
        # Bind the language before BaseProcessor.__init__ runs _setup(),
        # which will need it to locate language-specific models.
        self.language = language
        super().__init__(config)

    def _setup(self) -> None:
        """Load diacritic restoration models"""
        # TODO: stub — model loading is data-dependent and not yet implemented.
        pass

    def process(self, text: str, **kwargs) -> ProcessingResult:
        """Restore diacritics in text"""
        # TODO: stub — currently returns None instead of a ProcessingResult.
        pass
@dataclass
class TokenizationResult:
    """Result of tokenization operation."""
    tokens: List[str]  # surface token strings
    spans: List[tuple]  # (start, end) positions (per original note)
    metadata: Dict[str, Any] = field(default_factory=dict)  # tokenizer-specific extras
class TokenizerComparator:
    """Compare different tokenizers on African language text"""

    def __init__(self):
        # name -> tokenizer instance; populated by _register_tokenizers()
        self.tokenizers = {}
        self._register_tokenizers()

    def _register_tokenizers(self) -> None:
        """Register available tokenizers"""
        # TODO: stub — which tokenizers are available is not yet decided.
        pass

    def compare(self, text: str, language: Optional[Language] = None) -> Dict[str, TokenizationResult]:
        """Compare tokenization results across different tokenizers"""
        # TODO: stub — should run every registered tokenizer over *text*.
        pass

    def evaluate(self, text: str, ground_truth: List[str], language: Optional[Language] = None) -> Dict[str, float]:
        """Evaluate tokenizers against ground truth"""
        # TODO: stub — metric choice (e.g. token F1) not yet specified.
        pass
from typing import List, Optional, Dict, Any
from ..languages.models import Language
from ..core.base import BaseProcessor, ProcessingResult
@dataclass
class NameGenerationConfig:
    """Configuration for name generation."""
    gender: Optional[str] = None  # male, female, neutral (per original note)
    count: int = 1  # how many names to produce
    include_meaning: bool = False  # attach meanings when available
    cultural_context: Optional[str] = None  # presumably filters the source dataset — confirm
class NameGenerator(BaseProcessor):
    """Generate authentic names from African languages"""

    def __init__(self, language: Language, **config):
        # Bind the language before BaseProcessor.__init__ runs _setup(),
        # which will need it to locate the name datasets.
        self.language = language
        super().__init__(config)

    def _setup(self) -> None:
        """Load name datasets for the language"""
        # TODO: stub — dataset location/format not yet defined (see DataPaths.names_dir).
        pass

    def process(self, config: NameGenerationConfig, **kwargs) -> ProcessingResult:
        """Generate names based on configuration"""
        # TODO: stub — should honour config.count / gender / include_meaning.
        pass

    def generate_single(self, gender: Optional[str] = None) -> str:
        """Generate a single name"""
        # TODO: stub.
        pass

    def generate_batch(self, count: int, **kwargs) -> List[str]:
        """Generate multiple names"""
        # TODO: stub.
        pass

    def get_name_meaning(self, name: str) -> Optional[str]:
        """Get meaning of a name if available"""
        # TODO: stub — returns None both for unknown names and unimplemented lookup.
        pass
class MultiLanguageNameGenerator:
    """Generate names from multiple African languages"""

    def __init__(self):
        # language key -> per-language NameGenerator; filled by add_language()
        self.generators: Dict[str, NameGenerator] = {}

    def add_language(self, language: Language) -> None:
        """Add a language for name generation"""
        # TODO: stub — should construct and register a NameGenerator.
        pass

    def generate_from_region(self, region: str, **config) -> List[str]:
        """Generate names from a specific region"""
        # TODO: stub — region resolution presumably goes through the registry; confirm.
        pass

    def generate_mixed(self, languages: List[Language], **config) -> List[str]:
        """Generate names from multiple languages"""
        # TODO: stub.
        pass
from typing import Dict, Any, Optional
from ..languages.models import Language
from ..core.base import BaseProcessor, ProcessingResult
@dataclass
class NumberConversionRule:
    """Rule for converting numbers to words."""
    pattern: str  # presumably a pattern matched against the number — format not yet specified
    template: str  # output template filled in when the pattern matches
    conditions: Dict[str, Any] = field(default_factory=dict)  # extra applicability constraints
class NumberToWordsConverter(BaseProcessor):
    """Convert numbers to words in African languages"""

    def __init__(self, language: Language, **config):
        # Bind language and an empty rule list before BaseProcessor.__init__
        # runs _setup(), which loads the language's rules.
        self.language = language
        # NOTE(review): `List` is not imported by this section's own header
        # (only Dict, Any, Optional) — confirm the module's typing imports.
        self.rules: List[NumberConversionRule] = []
        super().__init__(config)

    def _setup(self) -> None:
        """Load conversion rules for the language"""
        # TODO: stub — rule files live under DataPaths.number_rules_dir.
        pass

    def process(self, number: int, **kwargs) -> ProcessingResult:
        """Convert number to words"""
        # TODO: stub — should wrap convert() in a ProcessingResult.
        pass

    def convert(self, number: int) -> str:
        """Convert number to words (simple interface)"""
        # TODO: stub.
        pass

    def add_rule(self, rule: NumberConversionRule) -> None:
        """Add a conversion rule"""
        # TODO: stub — should append to self.rules.
        pass

    def _apply_rules(self, number: int) -> str:
        """Apply conversion rules to number"""
        # TODO: stub — rule matching semantics depend on NumberConversionRule.pattern.
        pass
# utils/cache.py
from typing import Any, Optional, Dict
import time
from functools import wraps
class LRUCache:
    """Simple LRU cache.

    ``access_times`` records a monotonically increasing tick per key; the
    key with the smallest tick is evicted once ``max_size`` is exceeded.
    Note: ``get`` returns None for missing keys, so a cached value of None
    is indistinguishable from a miss — wrap values if you must cache None.
    """

    def __init__(self, max_size: int = 128):
        self.max_size = max_size
        self.cache: Dict[str, Any] = {}
        self.access_times: Dict[str, float] = {}
        # Internal counter instead of time.time(): avoids ties between
        # operations that happen within one clock resolution.
        self._tick = 0.0

    def _touch(self, key: str) -> None:
        """Mark *key* as most recently used."""
        self._tick += 1.0
        self.access_times[key] = self._tick

    def get(self, key: str) -> Optional[Any]:
        """Return the cached value for *key*, or None; refreshes recency on hit."""
        if key not in self.cache:
            return None
        self._touch(key)
        return self.cache[key]

    def set(self, key: str, value: Any) -> None:
        """Insert or overwrite *key*, evicting the least recently used entry when full."""
        if key not in self.cache and len(self.cache) >= self.max_size:
            oldest = min(self.access_times, key=self.access_times.get)
            del self.cache[oldest]
            del self.access_times[oldest]
        self.cache[key] = value
        self._touch(key)

    def clear(self) -> None:
        """Drop every cache entry and reset the recency counter."""
        self.cache.clear()
        self.access_times.clear()
        self._tick = 0.0


def cached(max_size: int = 128, ttl: Optional[int] = None):
    """Decorator caching function results in an LRUCache.

    Calls are keyed by ``repr`` of positional args plus sorted kwargs, so
    arguments must have stable reprs. When *ttl* (seconds) is given,
    entries older than *ttl* are recomputed.
    """
    def decorator(func):
        cache = LRUCache(max_size)

        @wraps(func)
        def wrapper(*args, **kwargs):
            key = repr((args, tuple(sorted(kwargs.items()))))
            entry = cache.get(key)
            if entry is not None:
                value, stored_at = entry
                if ttl is None or time.time() - stored_at < ttl:
                    return value
            value = func(*args, **kwargs)
            # Store the timestamp alongside the value so ttl checks work.
            cache.set(key, (value, time.time()))
            return value

        return wrapper
    return decorator
# utils/helpers.py
from typing import List, Dict, Any, Optional
import re
def normalize_string(s: str) -> str:
    """Normalize *s* for comparison.

    Applies NFKC Unicode normalization, casefolds (handles e.g. 'ß' -> 'ss'),
    and collapses all runs of whitespace to single spaces, trimming the ends.
    """
    import unicodedata  # local import: module headers stay untouched

    folded = unicodedata.normalize("NFKC", s).casefold()
    return " ".join(folded.split())
def fuzzy_match(s1: str, s2: str, threshold: float = 0.8) -> bool:
    """Check if two strings are similar enough.

    Uses difflib.SequenceMatcher's ratio (2*M / total length, in [0, 1])
    and returns True when it is >= *threshold*.
    """
    from difflib import SequenceMatcher  # local import: stdlib, used only here

    return SequenceMatcher(None, s1, s2).ratio() >= threshold
def extract_language_codes(text: str) -> List[str]:
    """Extract language codes from text"""
    # TODO: stub — intended to pull ISO 639-style codes out of free text,
    # presumably via the `re` module imported above; the matching rules
    # (bare 2-3 letter tokens vs. bracketed codes) are not yet specified.
    pass
def validate_unicode_text(text: str, allowed_scripts: Optional[List[str]] = None) -> bool:
    """Check that every alphabetic character belongs to an allowed script.

    Scripts are matched against the Unicode character-name prefix
    ("LATIN", "ARABIC", "ETHIOPIC", ...). Non-alphabetic characters
    (digits, punctuation, whitespace) are always accepted. With no
    *allowed_scripts*, any text validates. Unnamed characters fail.
    """
    import unicodedata  # local import: module headers stay untouched

    if not allowed_scripts:
        return True
    prefixes = tuple(s.upper() for s in allowed_scripts)
    for ch in text:
        if not ch.isalpha():
            continue  # script-neutral characters
        try:
            char_name = unicodedata.name(ch)
        except ValueError:
            return False  # unassigned / unnamed code point
        if not char_name.startswith(prefixes):
            return False
    return True
class DataValidator:
    """Validate data consistency and integrity.

    All validators return a list of human-readable error messages; an
    empty list means the data passed.
    """

    @staticmethod
    def validate_language_data(data: Dict[str, Any]) -> List[str]:
        """Validate a raw language record's structure.

        Requires 'name' (non-empty string) and 'codes' (mapping).
        """
        errors = DataValidator.check_required_fields(data, ["name", "codes"])
        if "name" in data:
            name = data["name"]
            if not isinstance(name, str) or not name.strip():
                errors.append("Field 'name' must be a non-empty string")
        if "codes" in data and not isinstance(data["codes"], dict):
            errors.append("Field 'codes' must be a mapping of code systems")
        return errors

    @staticmethod
    def check_required_fields(data: Dict[str, Any], required: List[str]) -> List[str]:
        """Return one error message per field in *required* missing from *data*."""
        return [
            f"Missing required field: {name}"
            for name in required
            if name not in data
        ]
pass"""
africanlanguages - A comprehensive Python package for African language processing
"""
from .languages import Language, LanguageDiscovery, CodeConverter
from .text import TextNormalizer, DiacriticRestorer, TokenizerComparator
from .names import NameGenerator, MultiLanguageNameGenerator
from .numbers import NumberToWordsConverter
from .core import PackageConfig, ConfigManager
# Version info
__version__ = "0.1.0"
__author__ = "Your Name"  # placeholder — fill in before release
__email__ = "[email protected]"  # placeholder — fill in before release
# Main interfaces
# Module-level singletons: the shared discovery facade and global config,
# created once at import time.
languages = LanguageDiscovery()
config = ConfigManager()
# Convenience functions
def get_language(code: str) -> Optional[Language]:
    """Look up a single language by any of its codes (None when unknown)."""
    registry = languages.registry
    return registry.get_language(code)
def search_languages(query: str) -> List[Language]:
    """Search registered languages by name or code."""
    registry = languages.registry
    return registry.search(query)
def list_countries() -> List[str]:
    """Return a sorted list of all countries with registered African languages."""
    countries: set = set()
    for lang in languages.registry.get_all_languages():
        countries.update(lang.geographic.countries)
    return sorted(countries)
def list_families() -> List[str]:
    """Return a sorted list of all known language families."""
    families = {
        lang.linguistic.family
        for lang in languages.registry.get_all_languages()
        if lang.linguistic.family
    }
    return sorted(families)
# Package initialization
def _initialize_package():
    """Initialize package resources"""
    # TODO: stub — presumably intended to preload registry data / warm
    # caches at import time; currently a no-op. Called once below.
    pass
_initialize_package()

This initial code design provides a solid foundation with:
- Clear class hierarchies with abstract base classes
- Comprehensive data models using dataclasses
- Fluent query interfaces for ease of use
- Extensible processor architecture for text processing
- Singleton patterns for registries and configuration
- Type hints throughout for better IDE support
- Error handling with custom exceptions
- Caching mechanisms for performance
- Validation utilities for data integrity
The design emphasizes modularity, extensibility, and ease of use while providing a robust foundation for African language processing tasks.