Last active
December 6, 2024 15:21
-
-
Save slavbar/7bf80918a88b7fa15ae488246d04f67c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Detects whether a piece of text contains characters from a fixed set of
/// restricted scripts / language-specific letters.
/// </summary>
/// <remarks>
/// Detection is purely character based: a single character from any of the
/// restricted ranges is enough to flag the whole text. ASCII letters, digits,
/// punctuation and the Latin-1 letters used by French are not restricted.
/// </remarks>
public static class DocumentLanguageValidator
{
    // One character-class pattern per restricted script / language group.
    private static readonly string[] RestrictedLanguagePatterns = new string[]
    {
        // Arabic (includes Persian and Urdu)
        @"[\u0600-\u06FF\u0750-\u077F]", // Arabic and Arabic Supplement blocks
        // Chinese (CJK Unified Ideographs)
        @"[\u4E00-\u9FFF]", // Han characters (Mandarin, Cantonese, ...)
        // Tagalog (Philippines)
        @"[\u1700-\u171F]", // Tagalog (Baybayin) script
        // Nigerian languages: dot-below vowels from Latin Extended Additional
        // (U+1EB8-1EB9, U+1ECC-1ECD, U+1EE5) plus acute-accented I/O/U from
        // Latin-1 Supplement. NOTE(review): the acute vowels also occur in
        // Spanish, Portuguese, etc. - confirm that flagging them is intended.
        @"[\u1EB8-\u1EB9\u1ECC-\u1ECD\u1EE5\u00CD\u00D3\u00DA\u00ED\u00F3\u00FA]",
        // Cyrillic (Russian, Ukrainian)
        @"[\u0400-\u04FF]", // Cyrillic block
        // Korean (Hangul)
        @"[\uAC00-\uD7AF]", // Hangul Syllables block
        // Tamil (South India / Sri Lanka)
        @"[\u0B80-\u0BFF]", // Tamil block
        // Hindi (North India)
        @"[\u0900-\u097F]", // Devanagari block
        // Amharic (Ethiopia)
        @"[\u1200-\u137F]", // Ethiopic block
        // Latin Extended-A, excluding the French letters located in that block
        // (U+0152 OE ligature, U+0153 oe ligature, U+0178 Y with diaeresis).
        // .NET has no "&&" class intersection (that is Java syntax); the
        // supported form is character class subtraction: [base-[excluded]].
        // Other French letters and ASCII punctuation lie outside
        // U+0100-U+017F, so they need no explicit exclusion here.
        @"[\u0100-\u017F-[\u0152\u0153\u0178]]"
    };

    // Regex instances are built once and reused. This field must stay below
    // RestrictedLanguagePatterns because static initializers run in textual order.
    private static readonly Regex[] RestrictedLanguageRegexes = RestrictedLanguagePatterns
        .Select(pattern => new Regex(pattern, RegexOptions.Compiled))
        .ToArray();

    /// <summary>
    /// Checks whether <paramref name="text"/> contains at least one character
    /// matched by any of the restricted language patterns.
    /// </summary>
    /// <param name="text">The text to inspect; may be <c>null</c> or empty.</param>
    /// <returns>
    /// <c>true</c> if any restricted character is present; <c>false</c>
    /// otherwise, including for <c>null</c> or empty input.
    /// </returns>
    public static bool ContainsRestrictedLanguage(string text)
    {
        // Nothing to inspect; also avoids the ArgumentNullException that
        // Regex.IsMatch throws for a null input.
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        return RestrictedLanguageRegexes.Any(regex => regex.IsMatch(text));
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Parameterized tests for <c>DocumentLanguageValidator.ContainsRestrictedLanguage</c>:
/// each case pairs an input text with whether it is expected to be flagged.
/// </summary>
public class LanguageValidationTests
{
    // Restricted
    [TestCase("你好,这是中文样本。歡迎來到這個例子。這裡包含不同的方言,如普通話、粵語、閩南語和吳語。", true)] // Chinese (Mandarin, Cantonese, Minnan, Wu)
    [TestCase("مرحبا بك في هذا النص", true)] // Arabic
    [TestCase("ᜋᜊᜌᜒᜈ᜔ ᜊᜌᜒᜋ᜔᜵", true)] // Tagalog (Philippines)
    [TestCase("नमस्ते, यह हिंदी में एक उदाहरण है।", true)] // Hindi (North India)
    [TestCase("ሰላም፣ ይህ አማርኛ የጽሑፍ ምሳሌ ነው።", true)] // Amharic (Ethiopia)
    [TestCase("안녕하세요, 이것은 한국어 텍스트입니다.", true)] // Korean
    [TestCase("Пример текста на русском языке.", true)] // Russian
    [TestCase("வணக்கம், இது தமிழ் உரையின் எடுத்துக்காட்டாகும்.", true)] // Tamil (South India/Sri Lanka)
    [TestCase("Ọ̀kọ̀ ní láti jẹ́ kí a gbé.", true)] // Nigerian (Yoruba)
    [TestCase("Ẹ ku abọ! Ẹ jọ̀ọ́ gbà mi.", true)] // Nigerian (Yoruba polite expression)
    // Restricted Mixed Language and Special Characters
    [TestCase("مرحبا! كيف حالك؟ + - = ()", true)] // Arabic with special characters
    [TestCase("你好! + - , . = ( )", true)] // Chinese with special characters
    // Unrestricted
    [TestCase("This is a valid English text.", false)] // English
    [TestCase("À l’école, j’ai vu une île où l’on mange du gâteau à la crème brûlée. Ça coûtait œuf et âne.", false)] // French
    // Special Characters (Unrestricted)
    [TestCase("+ - , . = ( ) ? ! : ;", false)] // Common special characters
    [TestCase("[]{}<>@#$%^&*", false)] // Additional special characters
    [TestCase("| ~ ` \\ / _", false)] // Additional special characters
    [TestCase("1234567890", false)] // Numbers should also be allowed
    [TestCase("Text with numbers 1234 and symbols + ( ) -", false)] // Mixed text with numbers and symbols
    [TestCase("Math equation: (a + b) = c", false)] // Example of common text with special characters
    [TestCase("URL example: https://example.com/?key=value", false)] // Text with URL and special characters
    // The address below replaces an obfuscation placeholder ("[email protected]")
    // so that this case really exercises an '@' address.
    [TestCase("Email: user@example.com", false)] // Email addresses
    [TestCase("File path: C:\\Users\\Example\\file.txt", false)] // File paths
    public void TestContainsRestrictedLanguage(string input, bool mustBeRestricted)
    {
        // Act: run the validator on the sample text.
        bool actuallyRestricted = DocumentLanguageValidator.ContainsRestrictedLanguage(input);

        // Assert: the verdict matches the expectation for this case.
        Assert.That(actuallyRestricted, Is.EqualTo(mustBeRestricted));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment