@wooparadog
Created September 11, 2025 09:31
#!/usr/bin/env python3
"""
Compare two Apple .xcstrings localization files to ensure they have the same final effects.
This script verifies that both files would produce identical user-facing translations.
"""
import json
import sys
from pathlib import Path
from typing import Any, Dict
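
# For reference, the script assumes .xcstrings JSON shaped roughly as sketched
# below (abbreviated and illustrative; only the fields this script reads are
# shown, and the key names and translations are made up):
#
#   {
#     "sourceLanguage": "en",
#     "strings": {
#       "welcome_title": {
#         "comment": "Shown on the first launch screen",
#         "extractionState": "manual",
#         "localizations": {
#           "de": {"stringUnit": {"state": "translated", "value": "Willkommen"}},
#           "fr": {
#             "variations": {
#               "plural": {
#                 "one":   {"stringUnit": {"state": "translated", "value": "1 fichier"}},
#                 "other": {"stringUnit": {"state": "translated", "value": "%d fichiers"}}
#               }
#             }
#           }
#         }
#       }
#     }
#   }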
class LocalizationComparator:
    def __init__(self, original_path: str, crowdin_path: str):
        self.original_path = Path(original_path)
        self.crowdin_path = Path(crowdin_path)
        self.original_data = None
        self.crowdin_data = None
        self.differences = []
        self.warnings = []

    def load_files(self) -> bool:
        """Load and parse both localization files."""
        try:
            with open(self.original_path, 'r', encoding='utf-8') as f:
                self.original_data = json.load(f)
            with open(self.crowdin_path, 'r', encoding='utf-8') as f:
                self.crowdin_data = json.load(f)
            return True
        except Exception as e:
            print(f"Error loading files: {e}")
            return False

    def extract_localizations(self, data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
        """
        Extract all localizations from a .xcstrings file.
        Returns: {string_key: {lang: localization_info, ...}, ...}
        """
        localizations = {}
        strings_section = data.get('strings', {})
        for string_key, string_data in strings_section.items():
            localizations[string_key] = {}
            # Handle localizations
            localization_data = string_data.get('localizations', {})
            for lang, lang_data in localization_data.items():
                localizations[string_key][lang] = self._extract_lang_data(lang_data)
        return localizations

    def _extract_lang_data(self, lang_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract relevant data from a language localization entry."""
        result = {}
        if 'stringUnit' in lang_data:
            string_unit = lang_data['stringUnit']
            result['value'] = string_unit.get('value', '')
            result['state'] = string_unit.get('state', '')
        if 'variations' in lang_data:
            result['variations'] = lang_data['variations']
        return result

    def get_effective_value(self, string_key: str, lang: str, localizations: Dict[str, Dict[str, Any]]) -> str:
        """
        Get the effective translation value that would be used at runtime.
        Handles fallbacks and empty values.
        """
        if string_key not in localizations:
            return string_key  # Fallback to key itself
        string_data = localizations[string_key]
        # If no localization for this language exists, use key as fallback
        if lang not in string_data:
            return string_key
        lang_data = string_data[lang]
        # Handle variations (plurals)
        if 'variations' in lang_data:
            plural_data = lang_data['variations'].get('plural', {})
            # For comparison purposes, use 'other' form if available, otherwise 'one'
            if 'other' in plural_data:
                return plural_data['other'].get('stringUnit', {}).get('value', string_key)
            elif 'one' in plural_data:
                return plural_data['one'].get('stringUnit', {}).get('value', string_key)
        # Regular string value
        value = lang_data.get('value', '')
        return value if value else string_key  # Fallback to key if empty

    def compare_localizations(self) -> bool:
        """Compare the localizations from both files."""
        original_localizations = self.extract_localizations(self.original_data)
        crowdin_localizations = self.extract_localizations(self.crowdin_data)
        # Get all string keys from both files
        original_keys = set(original_localizations.keys())
        crowdin_keys = set(crowdin_localizations.keys())
        # Check for missing keys
        missing_in_crowdin = original_keys - crowdin_keys
        missing_in_original = crowdin_keys - original_keys
        if missing_in_crowdin:
            self.differences.append(f"Keys missing in Crowdin file: {missing_in_crowdin}")
        if missing_in_original:
            self.differences.append(f"Keys missing in original file: {missing_in_original}")
        # Get all languages
        all_languages = set()
        for localizations in [original_localizations, crowdin_localizations]:
            for string_data in localizations.values():
                all_languages.update(string_data.keys())
        # Compare translations for common keys
        common_keys = original_keys & crowdin_keys
        translation_mismatches = []
        significant_mismatches = []
        for string_key in common_keys:
            for lang in all_languages:
                original_value = self.get_effective_value(string_key, lang, original_localizations)
                crowdin_value = self.get_effective_value(string_key, lang, crowdin_localizations)
                if original_value != crowdin_value:
                    mismatch = {
                        'key': string_key,
                        'language': lang,
                        'original': original_value,
                        'crowdin': crowdin_value
                    }
                    translation_mismatches.append(mismatch)
                    # A mismatch is significant only when the original had a real
                    # translation (it was not falling back to the key) and that
                    # translation changed or was dropped. Mismatches where the
                    # original fell back to the key are newly added translations
                    # and are reported as warnings below instead.
                    if original_value != string_key:
                        significant_mismatches.append(mismatch)
        # Report only significant mismatches as errors
        if significant_mismatches:
            for mismatch in significant_mismatches[:10]:  # Limit output to first 10
                self.differences.extend([
                    f"Translation mismatch for '{mismatch['key']}' in {mismatch['language']}:",
                    f" Original: '{mismatch['original']}'",
                    f" Crowdin: '{mismatch['crowdin']}'"
                ])
            if len(significant_mismatches) > 10:
                self.differences.append(f"... and {len(significant_mismatches) - 10} more significant mismatches")
        # Report non-significant ones as warnings (new translations added)
        added_translations = len(translation_mismatches) - len(significant_mismatches)
        if added_translations > 0:
            self.warnings.append(f"Crowdin file added {added_translations} new translations where original used key fallbacks")
        return len(self.differences) == 0

    def compare_metadata(self) -> None:
        """Compare metadata like source language, comments, etc."""
        # Compare source language
        orig_source_lang = self.original_data.get('sourceLanguage')
        crowdin_source_lang = self.crowdin_data.get('sourceLanguage')
        if orig_source_lang != crowdin_source_lang:
            self.warnings.append(f"Source language differs: original='{orig_source_lang}', crowdin='{crowdin_source_lang}'")
        # Compare comments and extraction states
        orig_strings = self.original_data.get('strings', {})
        crowdin_strings = self.crowdin_data.get('strings', {})
        for key in set(orig_strings.keys()) & set(crowdin_strings.keys()):
            orig_comment = orig_strings[key].get('comment')
            crowdin_comment = crowdin_strings[key].get('comment')
            if orig_comment != crowdin_comment:
                self.warnings.append(f"Comment differs for '{key}': original='{orig_comment}', crowdin='{crowdin_comment}'")
            orig_extraction = orig_strings[key].get('extractionState')
            crowdin_extraction = crowdin_strings[key].get('extractionState')
            if orig_extraction != crowdin_extraction:
                self.warnings.append(f"Extraction state differs for '{key}': original='{orig_extraction}', crowdin='{crowdin_extraction}'")

    def analyze_coverage(self) -> Dict[str, Any]:
        """Analyze localization coverage in both files."""
        original_localizations = self.extract_localizations(self.original_data)
        crowdin_localizations = self.extract_localizations(self.crowdin_data)
        # Get all languages
        all_languages = set()
        for localizations in [original_localizations, crowdin_localizations]:
            for string_data in localizations.values():
                all_languages.update(string_data.keys())
        coverage_report = {
            'languages': sorted(all_languages),
            'total_keys': len(set(original_localizations.keys()) | set(crowdin_localizations.keys())),
            'original_coverage': {},
            'crowdin_coverage': {}
        }
        for lang in all_languages:
            original_count = sum(1 for key, data in original_localizations.items()
                                 if lang in data and data[lang].get('value'))
            crowdin_count = sum(1 for key, data in crowdin_localizations.items()
                                if lang in data and data[lang].get('value'))
            coverage_report['original_coverage'][lang] = original_count
            coverage_report['crowdin_coverage'][lang] = crowdin_count
        return coverage_report

    def generate_report(self) -> str:
        """Generate a comprehensive comparison report."""
        report_lines = [
            "=== Localization Files Comparison Report ===",
            f"Original file: {self.original_path}",
            f"Crowdin file: {self.crowdin_path}",
            ""
        ]
        # Coverage analysis
        coverage = self.analyze_coverage()
        report_lines.extend([
            "=== Coverage Analysis ===",
            f"Total string keys: {coverage['total_keys']}",
            f"Languages found: {', '.join(coverage['languages'])}",
            ""
        ])
        for lang in coverage['languages']:
            orig_count = coverage['original_coverage'].get(lang, 0)
            crowdin_count = coverage['crowdin_coverage'].get(lang, 0)
            report_lines.append(f"{lang:8}: Original={orig_count:4d}, Crowdin={crowdin_count:4d}")
        report_lines.append("")
        # Differences
        if self.differences:
            report_lines.extend([
                "=== CRITICAL DIFFERENCES ===",
                "The files do NOT have the same final effects:",
                ""
            ])
            report_lines.extend(self.differences)
        else:
            report_lines.extend([
                "=== SUCCESS ===",
                "✓ The files have the same final localization effects!",
                "All translations would produce identical user-facing text.",
                ""
            ])
        # Warnings
        if self.warnings:
            report_lines.extend([
                "",
                "=== WARNINGS (Non-critical differences) ===",
            ])
            report_lines.extend(self.warnings)
        return "\n".join(report_lines)

    def run_comparison(self) -> bool:
        """Run the complete comparison process."""
        if not self.load_files():
            return False
        print("Comparing localization files...")
        # Compare the actual localizations
        same_effects = self.compare_localizations()
        # Compare metadata (non-critical)
        self.compare_metadata()
        # Print report
        print(self.generate_report())
        return same_effects


def main():
    if len(sys.argv) != 3:
        print("Usage: python compare_localizations.py <original_file> <crowdin_file>")
        print("Example: python compare_localizations.py Dola/DolaApp/Resources/Localizable.xcstrings NewLocalizable.xcstrings")
        sys.exit(1)
    original_file = sys.argv[1]
    crowdin_file = sys.argv[2]
    comparator = LocalizationComparator(original_file, crowdin_file)
    same_effects = comparator.run_comparison()
    # Exit with appropriate code
    sys.exit(0 if same_effects else 1)


if __name__ == "__main__":
    main()
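
# Example invocation (file paths are illustrative):
#
#   python3 compare_localizations.py Localizable.xcstrings NewLocalizable.xcstrings
#
# The process exits with code 0 when both files resolve to identical user-facing
# translations, and with code 1 when critical differences are found or a file
# fails to load, so the script can be used to gate a CI step.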