@wooparadog
Created September 11, 2025 09:31
#!/usr/bin/env python3
"""
Compare two Apple .xcstrings localization files to ensure they have the same final effects.
This script verifies that both files would produce identical user-facing translations.
"""
import json
import sys
from pathlib import Path
from typing import Any, Dict
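
# For reference, the script assumes .xcstrings JSON shaped roughly as sketched
# below (abbreviated and illustrative; only the fields this script reads are
# shown, and the key names and translations are made up):
#
#   {
#     "sourceLanguage": "en",
#     "strings": {
#       "welcome_title": {
#         "comment": "Shown on the first launch screen",
#         "extractionState": "manual",
#         "localizations": {
#           "de": {"stringUnit": {"state": "translated", "value": "Willkommen"}},
#           "fr": {
#             "variations": {
#               "plural": {
#                 "one":   {"stringUnit": {"state": "translated", "value": "1 fichier"}},
#                 "other": {"stringUnit": {"state": "translated", "value": "%d fichiers"}}
#               }
#             }
#           }
#         }
#       }
#     }
#   }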
class LocalizationComparator:
    def __init__(self, original_path: str, crowdin_path: str):
        self.original_path = Path(original_path)
        self.crowdin_path = Path(crowdin_path)
        self.original_data = None
        self.crowdin_data = None
        self.differences = []
        self.warnings = []

    def load_files(self) -> bool:
        """Load and parse both localization files."""
        try:
            with open(self.original_path, 'r', encoding='utf-8') as f:
                self.original_data = json.load(f)
            with open(self.crowdin_path, 'r', encoding='utf-8') as f:
                self.crowdin_data = json.load(f)
            return True
        except Exception as e:
            print(f"Error loading files: {e}")
            return False

    def extract_localizations(self, data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """
        Extract all localizations from a .xcstrings file.
        Returns: {string_key: {lang: localization_info, ...}, ...}
        """
        localizations = {}
        strings_section = data.get('strings', {})
        for string_key, string_data in strings_section.items():
            localizations[string_key] = {}
            # Handle localizations
            localization_data = string_data.get('localizations', {})
            for lang, lang_data in localization_data.items():
                localizations[string_key][lang] = self._extract_lang_data(lang_data)
        return localizations

    def _extract_lang_data(self, lang_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract relevant data from a language localization entry."""
        result = {}
        if 'stringUnit' in lang_data:
            string_unit = lang_data['stringUnit']
            result['value'] = string_unit.get('value', '')
            result['state'] = string_unit.get('state', '')
        if 'variations' in lang_data:
            result['variations'] = lang_data['variations']
        return result

    def get_effective_value(self, string_key: str, lang: str, localizations: Dict[str, Dict[str, Any]]) -> str:
        """
        Get the effective translation value that would be used at runtime.
        Handles fallbacks and empty values.
        """
        if string_key not in localizations:
            return string_key  # Fallback to key itself
        string_data = localizations[string_key]
        # If no localization for this language exists, use key as fallback
        if lang not in string_data:
            return string_key
        lang_data = string_data[lang]
        # Handle variations (plurals)
        if 'variations' in lang_data:
            plural_data = lang_data['variations'].get('plural', {})
            # For comparison purposes, use 'other' form if available, otherwise 'one'
            if 'other' in plural_data:
                return plural_data['other'].get('stringUnit', {}).get('value', string_key)
            elif 'one' in plural_data:
                return plural_data['one'].get('stringUnit', {}).get('value', string_key)
        # Regular string value
        value = lang_data.get('value', '')
        return value if value else string_key  # Fallback to key if empty

    def compare_localizations(self) -> bool:
        """Compare the localizations from both files."""
        original_localizations = self.extract_localizations(self.original_data)
        crowdin_localizations = self.extract_localizations(self.crowdin_data)
        # Get all string keys from both files
        original_keys = set(original_localizations.keys())
        crowdin_keys = set(crowdin_localizations.keys())
        # Check for missing keys
        missing_in_crowdin = original_keys - crowdin_keys
        missing_in_original = crowdin_keys - original_keys
        if missing_in_crowdin:
            self.differences.append(f"Keys missing in Crowdin file: {missing_in_crowdin}")
        if missing_in_original:
            self.differences.append(f"Keys missing in original file: {missing_in_original}")
        # Get all languages
        all_languages = set()
        for localizations in [original_localizations, crowdin_localizations]:
            for string_data in localizations.values():
                all_languages.update(string_data.keys())
        # Compare translations for common keys
        common_keys = original_keys & crowdin_keys
        translation_mismatches = []
        significant_mismatches = []
        for string_key in common_keys:
            for lang in all_languages:
                original_value = self.get_effective_value(string_key, lang, original_localizations)
                crowdin_value = self.get_effective_value(string_key, lang, crowdin_localizations)
                if original_value != crowdin_value:
                    mismatch = {
                        'key': string_key,
                        'language': lang,
                        'original': original_value,
                        'crowdin': crowdin_value
                    }
                    translation_mismatches.append(mismatch)
                    # A mismatch is significant only when the original had a real
                    # translation (it was not falling back to the key) and that
                    # translation changed or was dropped. Mismatches where the
                    # original fell back to the key are newly added translations
                    # and are reported as warnings below instead.
                    if original_value != string_key:
                        significant_mismatches.append(mismatch)
        # Report only significant mismatches as errors
        if significant_mismatches:
            for mismatch in significant_mismatches[:10]:  # Limit output to first 10
                self.differences.extend([
                    f"Translation mismatch for '{mismatch['key']}' in {mismatch['language']}:",
                    f" Original: '{mismatch['original']}'",
                    f" Crowdin: '{mismatch['crowdin']}'"
                ])
            if len(significant_mismatches) > 10:
                self.differences.append(f"... and {len(significant_mismatches) - 10} more significant mismatches")
        # Report non-significant ones as warnings (new translations added)
        added_translations = len(translation_mismatches) - len(significant_mismatches)
        if added_translations > 0:
            self.warnings.append(f"Crowdin file added {added_translations} new translations where original used key fallbacks")
        return len(self.differences) == 0

    def compare_metadata(self) -> None:
        """Compare metadata like source language, comments, etc."""
        # Compare source language
        orig_source_lang = self.original_data.get('sourceLanguage')
        crowdin_source_lang = self.crowdin_data.get('sourceLanguage')
        if orig_source_lang != crowdin_source_lang:
            self.warnings.append(f"Source language differs: original='{orig_source_lang}', crowdin='{crowdin_source_lang}'")
        # Compare comments and extraction states
        orig_strings = self.original_data.get('strings', {})
        crowdin_strings = self.crowdin_data.get('strings', {})
        for key in set(orig_strings.keys()) & set(crowdin_strings.keys()):
            orig_comment = orig_strings[key].get('comment')
            crowdin_comment = crowdin_strings[key].get('comment')
            if orig_comment != crowdin_comment:
                self.warnings.append(f"Comment differs for '{key}': original='{orig_comment}', crowdin='{crowdin_comment}'")
            orig_extraction = orig_strings[key].get('extractionState')
            crowdin_extraction = crowdin_strings[key].get('extractionState')
            if orig_extraction != crowdin_extraction:
                self.warnings.append(f"Extraction state differs for '{key}': original='{orig_extraction}', crowdin='{crowdin_extraction}'")

    def analyze_coverage(self) -> Dict[str, Any]:
        """Analyze localization coverage in both files."""
        original_localizations = self.extract_localizations(self.original_data)
        crowdin_localizations = self.extract_localizations(self.crowdin_data)
        # Get all languages
        all_languages = set()
        for localizations in [original_localizations, crowdin_localizations]:
            for string_data in localizations.values():
                all_languages.update(string_data.keys())
        coverage_report = {
            'languages': sorted(all_languages),
            'total_keys': len(set(original_localizations.keys()) | set(crowdin_localizations.keys())),
            'original_coverage': {},
            'crowdin_coverage': {}
        }
        for lang in all_languages:
            original_count = sum(1 for key, data in original_localizations.items()
                                 if lang in data and data[lang].get('value'))
            crowdin_count = sum(1 for key, data in crowdin_localizations.items()
                                if lang in data and data[lang].get('value'))
            coverage_report['original_coverage'][lang] = original_count
            coverage_report['crowdin_coverage'][lang] = crowdin_count
        return coverage_report

    def generate_report(self) -> str:
        """Generate a comprehensive comparison report."""
        report_lines = [
            "=== Localization Files Comparison Report ===",
            f"Original file: {self.original_path}",
            f"Crowdin file: {self.crowdin_path}",
            ""
        ]
        # Coverage analysis
        coverage = self.analyze_coverage()
        report_lines.extend([
            "=== Coverage Analysis ===",
            f"Total string keys: {coverage['total_keys']}",
            f"Languages found: {', '.join(coverage['languages'])}",
            ""
        ])
        for lang in coverage['languages']:
            orig_count = coverage['original_coverage'].get(lang, 0)
            crowdin_count = coverage['crowdin_coverage'].get(lang, 0)
            report_lines.append(f"{lang:8}: Original={orig_count:4d}, Crowdin={crowdin_count:4d}")
        report_lines.append("")
        # Differences
        if self.differences:
            report_lines.extend([
                "=== CRITICAL DIFFERENCES ===",
                "The files do NOT have the same final effects:",
                ""
            ])
            report_lines.extend(self.differences)
        else:
            report_lines.extend([
                "=== SUCCESS ===",
                "✓ The files have the same final localization effects!",
                "All translations would produce identical user-facing text.",
                ""
            ])
        # Warnings
        if self.warnings:
            report_lines.extend([
                "",
                "=== WARNINGS (Non-critical differences) ===",
            ])
            report_lines.extend(self.warnings)
        return "\n".join(report_lines)

    def run_comparison(self) -> bool:
        """Run the complete comparison process."""
        if not self.load_files():
            return False
        print("Comparing localization files...")
        # Compare the actual localizations
        same_effects = self.compare_localizations()
        # Compare metadata (non-critical)
        self.compare_metadata()
        # Print report
        print(self.generate_report())
        return same_effects


def main():
    if len(sys.argv) != 3:
        print("Usage: python compare_localizations.py <original_file> <crowdin_file>")
        print("Example: python compare_localizations.py Dola/DolaApp/Resources/Localizable.xcstrings NewLocalizable.xcstrings")
        sys.exit(1)
    original_file = sys.argv[1]
    crowdin_file = sys.argv[2]
    comparator = LocalizationComparator(original_file, crowdin_file)
    same_effects = comparator.run_comparison()
    # Exit with appropriate code
    sys.exit(0 if same_effects else 1)


if __name__ == "__main__":
    main()
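
# Example invocation (file paths are illustrative):
#
#   python3 compare_localizations.py Localizable.xcstrings NewLocalizable.xcstrings
#
# The process exits with code 0 when both files resolve to identical user-facing
# translations, and with code 1 when critical differences are found or a file
# fails to load, so the script can be used to gate a CI step.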