Skip to content

Instantly share code, notes, and snippets.

@slavbar
Last active December 6, 2024 15:21
Show Gist options
  • Select an option

  • Save slavbar/7bf80918a88b7fa15ae488246d04f67c to your computer and use it in GitHub Desktop.

Select an option

Save slavbar/7bf80918a88b7fa15ae488246d04f67c to your computer and use it in GitHub Desktop.
/// <summary>
/// Detects whether a piece of text contains at least one character belonging to one of
/// the character ranges listed in <see cref="RestrictedLanguagePatterns"/>.
/// </summary>
public static class DocumentLanguageValidator
{
    // One character-class pattern per restricted script/alphabet. Each pattern matches a
    // single character; a text is restricted as soon as any one of them matches.
    private static readonly string[] RestrictedLanguagePatterns = new string[]
    {
        // Arabic (includes Persian and Urdu): Arabic block and Arabic Supplement
        @"[\u0600-\u06FF\u0750-\u077F]",
        // Chinese: CJK Unified Ideographs (shared by Mandarin and Cantonese)
        @"[\u4E00-\u9FFF]",
        // Tagalog (Philippines): Tagalog script
        @"[\u1700-\u171F]",
        // Nigerian languages: dotted vowels from Latin Extended Additional
        // (U+1EB8/1EB9, U+1ECC/1ECD, U+1EE5) plus the acute-accented vowels
        // I, O, U (both cases) from Latin-1 Supplement.
        // NOTE(review): the acute-accented vowels are not exclusive to Nigerian
        // languages (e.g. Spanish uses them) - confirm this is intended.
        @"[\u1EB8-\u1EB9\u1ECC-\u1ECD\u1EE5\u00CD\u00D3\u00DA\u00ED\u00F3\u00FA]",
        // Cyrillic (Russian, Ukrainian)
        @"[\u0400-\u04FF]",
        // Korean: Hangul syllables
        @"[\uAC00-\uD7AF]",
        // Tamil (South India/Sri Lanka)
        @"[\u0B80-\u0BFF]",
        // Hindi (North India): Devanagari script
        @"[\u0900-\u097F]",
        // Amharic (Ethiopia): Ethiopic script
        @"[\u1200-\u137F]",
        // Latin Extended-A, minus the letters French needs from that block
        // (U+0152, U+0153 = OE ligature; U+0178 = Y with diaeresis).
        // .NET regexes have no Java-style "&&" intersection; the supported form is
        // character class subtraction: [base-[excluded]]. The remaining French letters
        // and ASCII punctuation lie outside U+0100-U+017F, so they need no exclusion.
        @"[\u0100-\u017F-[\u0152\u0153\u0178]]"
    };

    // All patterns joined into a single alternation and built once, so each call is a
    // single scan of the text. Declared after the pattern array on purpose: static
    // field initializers run in textual order.
    private static readonly Regex RestrictedLanguageRegex = new Regex(
        string.Join("|", RestrictedLanguagePatterns),
        RegexOptions.Compiled);

    /// <summary>
    /// Checks whether <paramref name="text"/> contains any restricted character.
    /// </summary>
    /// <param name="text">Text to inspect; may be null or empty.</param>
    /// <returns>
    /// <c>true</c> if at least one character of <paramref name="text"/> matches one of the
    /// restricted patterns; <c>false</c> otherwise, including for null or empty input.
    /// </returns>
    public static bool ContainsRestrictedLanguage(string text)
    {
        // Nothing to match in null/empty input; also avoids ArgumentNullException.
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        return RestrictedLanguageRegex.IsMatch(text);
    }
}
/// <summary>
/// Parameterized checks for <c>DocumentLanguageValidator.ContainsRestrictedLanguage</c>:
/// each case pairs a sample text with the result the validator is expected to return.
/// </summary>
public class LanguageValidationTests
{
    // ---- Expected to be flagged as restricted ----
    [TestCase("你好,这是中文样本。歡迎來到這個例子。這裡包含不同的方言,如普通話、粵語、閩南語和吳語。", true)] // Chinese (Mandarin, Cantonese, Minnan, Wu)
    [TestCase("مرحبا بك في هذا النص", true)] // Arabic
    [TestCase("ᜋᜊᜌᜒᜈ᜔ ᜊᜌᜒᜋ᜔᜵", true)] // Tagalog script (Philippines)
    [TestCase("नमस्ते, यह हिंदी में एक उदाहरण है।", true)] // Hindi (Devanagari)
    [TestCase("ሰላም፣ ይህ አማርኛ የጽሑፍ ምሳሌ ነው።", true)] // Amharic (Ethiopic)
    [TestCase("안녕하세요, 이것은 한국어 텍스트입니다.", true)] // Korean (Hangul)
    [TestCase("Пример текста на русском языке.", true)] // Russian (Cyrillic)
    [TestCase("வணக்கம், இது தமிழ் உரையின் எடுத்துக்காட்டாகும்.", true)] // Tamil
    [TestCase("Ọ̀kọ̀ ní láti jẹ́ kí a gbé.", true)] // Yoruba (Nigeria)
    [TestCase("Ẹ ku abọ! Ẹ jọ̀ọ́ gbà mi.", true)] // Yoruba polite expression (Nigeria)

    // ---- Restricted script mixed with ordinary punctuation ----
    [TestCase("مرحبا! كيف حالك؟ + - = ()", true)] // Arabic plus symbols
    [TestCase("你好! + - , . = ( )", true)] // Chinese plus symbols

    // ---- Expected to pass as unrestricted: natural language ----
    [TestCase("This is a valid English text.", false)] // English
    [TestCase("À l’école, j’ai vu une île où l’on mange du gâteau à la crème brûlée. Ça coûtait œuf et âne.", false)] // French, including accented letters and the oe ligature

    // ---- Expected to pass as unrestricted: symbols, digits and technical text ----
    [TestCase("+ - , . = ( ) ? ! : ;", false)] // Common punctuation
    [TestCase("[]{}<>@#$%^&*", false)] // Brackets and operators
    [TestCase("| ~ ` \\ / _", false)] // Remaining ASCII symbols
    [TestCase("1234567890", false)] // Digits only
    [TestCase("Text with numbers 1234 and symbols + ( ) -", false)] // Words, digits and symbols together
    [TestCase("Math equation: (a + b) = c", false)] // Simple equation
    [TestCase("URL example: https://example.com/?key=value", false)] // URL with query string
    [TestCase("Email: [email protected]", false)] // Email-style text
    [TestCase("File path: C:\\Users\\Example\\file.txt", false)] // Windows file path
    public void TestContainsRestrictedLanguage(string input, bool mustBeRestricted) =>
        // The validator's verdict for the sample must equal the expectation attached to the case.
        Assert.That(
            DocumentLanguageValidator.ContainsRestrictedLanguage(input),
            Is.EqualTo(mustBeRestricted));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment