#!/usr/bin/env python3
"""
Compare two Apple .xcstrings localization files to ensure they have the same final effects.
This script verifies that both files would produce identical user-facing translations.
"""
import json
import sys
from typing import Dict, Any
from pathlib import Path


class LocalizationComparator:
    def __init__(self, original_path: str, crowdin_path: str):
        self.original_path = Path(original_path)
        self.crowdin_path = Path(crowdin_path)
        self.original_data = None
        self.crowdin_data = None
        self.differences = []
        self.warnings = []

    def load_files(self) -> bool:
        """Load and parse both localization files."""
        try:
            with open(self.original_path, 'r', encoding='utf-8') as f:
                self.original_data = json.load(f)
            with open(self.crowdin_path, 'r', encoding='utf-8') as f:
                self.crowdin_data = json.load(f)
            return True
        except (OSError, json.JSONDecodeError) as e:
            print(f"Error loading files: {e}")
            return False

    def extract_localizations(self, data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """
        Extract all localizations from a .xcstrings file.

        Returns: {string_key: {lang: localization_info, ...}, ...}
        """
        localizations = {}
        strings_section = data.get('strings', {})
        for string_key, string_data in strings_section.items():
            localizations[string_key] = {}
            # Collect the per-language localization entries
            localization_data = string_data.get('localizations', {})
            for lang, lang_data in localization_data.items():
                localizations[string_key][lang] = self._extract_lang_data(lang_data)
        return localizations

    def _extract_lang_data(self, lang_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract relevant data from a language localization entry."""
        result = {}
        if 'stringUnit' in lang_data:
            string_unit = lang_data['stringUnit']
            result['value'] = string_unit.get('value', '')
            result['state'] = string_unit.get('state', '')
        if 'variations' in lang_data:
            result['variations'] = lang_data['variations']
        return result
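
    # A plural entry under 'variations', as consumed by get_effective_value
    # below, has roughly this shape (a sketch of the String Catalog plural
    # variations; the values are illustrative):
    #
    #   "variations": {
    #     "plural": {
    #       "one":   {"stringUnit": {"state": "translated", "value": "1 item"}},
    #       "other": {"stringUnit": {"state": "translated", "value": "%d items"}}
    #     }
    #   }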

    def get_effective_value(self, string_key: str, lang: str, localizations: Dict[str, Dict[str, Any]]) -> str:
        """
        Get the effective translation value that would be used at runtime.
        Handles fallbacks and empty values.
        """
        if string_key not in localizations:
            return string_key  # Fall back to the key itself
        string_data = localizations[string_key]
        # If no localization for this language exists, use the key as the fallback
        if lang not in string_data:
            return string_key
        lang_data = string_data[lang]
        # Handle variations (plurals)
        if 'variations' in lang_data:
            plural_data = lang_data['variations'].get('plural', {})
            # For comparison purposes, use the 'other' form if available, otherwise 'one'
            if 'other' in plural_data:
                return plural_data['other'].get('stringUnit', {}).get('value', string_key)
            elif 'one' in plural_data:
                return plural_data['one'].get('stringUnit', {}).get('value', string_key)
        # Regular string value
        value = lang_data.get('value', '')
        return value if value else string_key  # Fall back to the key if empty
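
    # Example (hypothetical key): if a file has no "fr" entry for the key
    # "Done", get_effective_value("Done", "fr", ...) returns "Done" itself, so a
    # Crowdin file that also lacks that entry compares as having the same effect.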

    def compare_localizations(self) -> bool:
        """Compare the localizations from both files."""
        original_localizations = self.extract_localizations(self.original_data)
        crowdin_localizations = self.extract_localizations(self.crowdin_data)
        # Get all string keys from both files
        original_keys = set(original_localizations.keys())
        crowdin_keys = set(crowdin_localizations.keys())
        # Check for missing keys
        missing_in_crowdin = original_keys - crowdin_keys
        missing_in_original = crowdin_keys - original_keys
        if missing_in_crowdin:
            self.differences.append(f"Keys missing in Crowdin file: {missing_in_crowdin}")
        if missing_in_original:
            self.differences.append(f"Keys missing in original file: {missing_in_original}")
        # Get all languages
        all_languages = set()
        for localizations in [original_localizations, crowdin_localizations]:
            for string_data in localizations.values():
                all_languages.update(string_data.keys())
        # Compare translations for common keys
        common_keys = original_keys & crowdin_keys
        translation_mismatches = []
        significant_mismatches = []
        for string_key in common_keys:
            for lang in all_languages:
                original_value = self.get_effective_value(string_key, lang, original_localizations)
                crowdin_value = self.get_effective_value(string_key, lang, crowdin_localizations)
                if original_value != crowdin_value:
                    mismatch = {
                        'key': string_key,
                        'language': lang,
                        'original': original_value,
                        'crowdin': crowdin_value
                    }
                    translation_mismatches.append(mismatch)
                    # A mismatch is significant only when the original file had a
                    # real translation (i.e. was not just falling back to the key)
                    # that the Crowdin file changed. Since the two values already
                    # differ here, that reduces to checking the original value.
                    if original_value != string_key:
                        significant_mismatches.append(mismatch)
        # Report only significant mismatches as errors
        if significant_mismatches:
            for mismatch in significant_mismatches[:10]:  # Limit output to the first 10
                self.differences.extend([
                    f"Translation mismatch for '{mismatch['key']}' in {mismatch['language']}:",
                    f"  Original: '{mismatch['original']}'",
                    f"  Crowdin: '{mismatch['crowdin']}'"
                ])
            if len(significant_mismatches) > 10:
                self.differences.append(f"... and {len(significant_mismatches) - 10} more significant mismatches")
        # Report the non-significant ones as warnings (new translations added)
        added_translations = len(translation_mismatches) - len(significant_mismatches)
        if added_translations > 0:
            self.warnings.append(f"Crowdin file added {added_translations} new translations where original used key fallbacks")
        return len(self.differences) == 0

    def compare_metadata(self) -> None:
        """Compare metadata like source language, comments, etc."""
        # Compare the source language
        orig_source_lang = self.original_data.get('sourceLanguage')
        crowdin_source_lang = self.crowdin_data.get('sourceLanguage')
        if orig_source_lang != crowdin_source_lang:
            self.warnings.append(f"Source language differs: original='{orig_source_lang}', crowdin='{crowdin_source_lang}'")
        # Compare comments and extraction states
        orig_strings = self.original_data.get('strings', {})
        crowdin_strings = self.crowdin_data.get('strings', {})
        for key in set(orig_strings.keys()) & set(crowdin_strings.keys()):
            orig_comment = orig_strings[key].get('comment')
            crowdin_comment = crowdin_strings[key].get('comment')
            if orig_comment != crowdin_comment:
                self.warnings.append(f"Comment differs for '{key}': original='{orig_comment}', crowdin='{crowdin_comment}'")
            orig_extraction = orig_strings[key].get('extractionState')
            crowdin_extraction = crowdin_strings[key].get('extractionState')
            if orig_extraction != crowdin_extraction:
                self.warnings.append(f"Extraction state differs for '{key}': original='{orig_extraction}', crowdin='{crowdin_extraction}'")

    def analyze_coverage(self) -> Dict[str, Any]:
        """Analyze localization coverage in both files."""
        original_localizations = self.extract_localizations(self.original_data)
        crowdin_localizations = self.extract_localizations(self.crowdin_data)
        # Get all languages
        all_languages = set()
        for localizations in [original_localizations, crowdin_localizations]:
            for string_data in localizations.values():
                all_languages.update(string_data.keys())
        coverage_report = {
            'languages': sorted(all_languages),
            'total_keys': len(set(original_localizations.keys()) | set(crowdin_localizations.keys())),
            'original_coverage': {},
            'crowdin_coverage': {}
        }
        # Count, per language, how many keys carry a non-empty translation
        for lang in all_languages:
            original_count = sum(1 for data in original_localizations.values()
                                 if lang in data and data[lang].get('value'))
            crowdin_count = sum(1 for data in crowdin_localizations.values()
                                if lang in data and data[lang].get('value'))
            coverage_report['original_coverage'][lang] = original_count
            coverage_report['crowdin_coverage'][lang] = crowdin_count
        return coverage_report

    def generate_report(self) -> str:
        """Generate a comprehensive comparison report."""
        report_lines = [
            "=== Localization Files Comparison Report ===",
            f"Original file: {self.original_path}",
            f"Crowdin file: {self.crowdin_path}",
            ""
        ]
        # Coverage analysis
        coverage = self.analyze_coverage()
        report_lines.extend([
            "=== Coverage Analysis ===",
            f"Total string keys: {coverage['total_keys']}",
            f"Languages found: {', '.join(coverage['languages'])}",
            ""
        ])
        for lang in coverage['languages']:
            orig_count = coverage['original_coverage'].get(lang, 0)
            crowdin_count = coverage['crowdin_coverage'].get(lang, 0)
            report_lines.append(f"{lang:8}: Original={orig_count:4d}, Crowdin={crowdin_count:4d}")
        report_lines.append("")
        # Differences
        if self.differences:
            report_lines.extend([
                "=== CRITICAL DIFFERENCES ===",
                "The files do NOT have the same final effects:",
                ""
            ])
            report_lines.extend(self.differences)
        else:
            report_lines.extend([
                "=== SUCCESS ===",
                "✓ The files have the same final localization effects!",
                "All translations would produce identical user-facing text.",
                ""
            ])
        # Warnings
        if self.warnings:
            report_lines.extend([
                "",
                "=== WARNINGS (Non-critical differences) ===",
            ])
            report_lines.extend(self.warnings)
        return "\n".join(report_lines)

    def run_comparison(self) -> bool:
        """Run the complete comparison process."""
        if not self.load_files():
            return False
        print("Comparing localization files...")
        # Compare the actual localizations
        same_effects = self.compare_localizations()
        # Compare metadata (non-critical)
        self.compare_metadata()
        # Print the report
        print(self.generate_report())
        return same_effects


def main():
    if len(sys.argv) != 3:
        print("Usage: python compare_localizations.py <original_file> <crowdin_file>")
        print("Example: python compare_localizations.py Dola/DolaApp/Resources/Localizable.xcstrings NewLocalizable.xcstrings")
        sys.exit(1)
    original_file = sys.argv[1]
    crowdin_file = sys.argv[2]
    comparator = LocalizationComparator(original_file, crowdin_file)
    same_effects = comparator.run_comparison()
    # Exit with an appropriate code: 0 if the files have the same effects, 1 otherwise
    sys.exit(0 if same_effects else 1)


if __name__ == "__main__":
    main()