Last active
January 13, 2025 14:32
-
-
Save anotherjames/b30bc805b80e970afc5de1eafd70f48e to your computer and use it in GitHub Desktop.
Drupal form element validation handler that checks a value is mostly Latin characters.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| use Drupal\Component\Utility\Html; | |
| use Drupal\Core\Form\FormStateInterface; | |
| use Drupal\filter\FilterPluginCollection; | |
/**
 * Implements hook_form_alter().
 *
 * Attaches the "mostly Latin characters" element validator to the 'message'
 * element of the 'get_in_touch' webform. Every other form is left untouched.
 *
 * @param array $form
 *   The form render array, altered by reference.
 * @param \Drupal\Core\Form\FormStateInterface $form_state
 *   The current form state (not used here).
 * @param string $form_id
 *   The form ID (not used here; the form is matched on '#webform_id').
 */
function latinvalidation_form_alter(&$form, FormStateInterface $form_state, $form_id) {
  // Only the main contact webform is targeted. Strict comparison avoids any
  // type juggling on the identifier.
  if (($form['#webform_id'] ?? NULL) !== 'get_in_touch') {
    return;
  }

  // Appending to a missing element would silently create a stray, typeless
  // 'message' entry in the form, so only attach when the element exists.
  if (!is_array($form['elements']['message'] ?? NULL)) {
    return;
  }

  $form['elements']['message']['#element_validate'][] = 'latinvalidation_latin_chars_element_validate';
}
/**
 * Element validation handler that checks a value is mostly Latin characters.
 *
 * Links, HTML tags and script-neutral characters (whitespace, punctuation,
 * digits, emoji, combining marks) are ignored. Of what remains, at most 20% may
 * be non-Latin characters; otherwise an error is set on the element. An error
 * is also set when nothing meaningful remains once those parts are ignored.
 *
 * Empty values are skipped so that '#required' handling stays in charge of
 * them.
 *
 * @param array $element
 *   The form element being validated; its '#value' is inspected.
 * @param \Drupal\Core\Form\FormStateInterface $form_state
 *   The form state on which any validation error is set.
 */
function latinvalidation_latin_chars_element_validate($element, FormStateInterface $form_state) {
  $value = $element['#value'] ?? '';
  // Note that '0' is not treated as empty: it is analysed like any other
  // single-character message (and rejected as having no meaningful content).
  if (!is_string($value) || $value === '') {
    return;
  }

  $proportion = _latinvalidation_nonlatin_proportion(_latinvalidation_meaningful_text($value));

  // NOTE(review): both messages hard-code href="/contact", which ignores the
  // site's base path on subdirectory installs. Switching to a ':url'
  // placeholder would change the translatable source strings, so it is left
  // as-is here.
  if ($proportion === NULL) {
    // Nothing but links, markup, whitespace and/or punctuation was submitted.
    $form_state->setError($element, t('Please include your message for us. You may prefer to <a href="/contact">contact us via phone or email</a>.'));
  }
  // We picked a fixed proportion of 20%, but this could be configurable.
  elseif ($proportion > 0.2) {
    $form_state->setError($element, t('We may have trouble trying to understand your message due to the language and/or symbols used in it. Please use plain English instead. You may prefer to <a href="/contact">contact us via phone or email</a>.'));
  }
}

/**
 * Strips links and HTML tags from a submitted value.
 *
 * @param string $text
 *   The raw submitted text.
 *
 * @return string
 *   The text with BBCode-style [url] markers, HTML tags and (when core's URL
 *   filter is available) links removed.
 */
function _latinvalidation_meaningful_text(string $text): string {
  // We've received a lot of submissions using the format
  // [url=http...]...[/url]. Convert those markers to anchor tags so they are
  // removed along with any other HTML below.
  $text = str_replace('[/url]', '</a>', $text);
  // preg_replace() returns NULL on a PCRE error; keep the text in that case.
  $text = preg_replace('/\[url=([^]]+)]/', '<a href="$1">', $text) ?? $text;

  // Ignore HTML tags (since tags are usually links).
  $text = strip_tags($text);

  // Convert any URLs in plain text to links, using core's URL filter, then
  // drop the links entirely. The filter module may not be installed, so check
  // for its plugin manager rather than assuming the service exists.
  $plugin_id = 'filter_url';
  if (!\Drupal::hasService('plugin.manager.filter')) {
    return $text;
  }
  $filter_manager = \Drupal::service('plugin.manager.filter');
  if (!$filter_manager->getDefinition($plugin_id, FALSE)) {
    return $text;
  }

  // Use a filter plugin collection, just like
  // \Drupal\filter\Entity\FilterFormat::filters().
  $collection = new FilterPluginCollection($filter_manager, [
    $plugin_id => [],
  ]);
  /** @var \Drupal\filter\Plugin\Filter\FilterUrl $filter */
  $filter = $collection->get($plugin_id);
  $text = $filter->prepare($text, '');
  $text = $filter->process($text, '')->getProcessedText();

  $dom = Html::load($text);
  $xpath = new \DOMXPath($dom);
  /** @var \DOMElement $anchor */
  foreach ($xpath->query('//a') as $anchor) {
    $anchor->parentNode->removeChild($anchor);
  }

  return Html::decodeEntities(Html::serialize($dom));
}

/**
 * Calculates the proportion of non-Latin characters in a string.
 *
 * Characters in the Common script (whitespace, punctuation, digits, symbols,
 * emoji) and the Inherited script (combining marks, e.g. decomposed accents on
 * Latin letters) are ignored on both sides of the ratio. Because whitespace is
 * Common, no separate whitespace stripping is needed.
 *
 * Counting is per code point, not per grapheme. We don't need to be too
 * accurate as the result is only compared against a vague threshold to limit
 * probably-unwanted submissions.
 *
 * @param string $text
 *   The text to analyse.
 *
 * @return float|null
 *   A value between 0 and 1, or NULL when the text has no characters that
 *   count. Text that cannot be analysed (e.g. invalid UTF-8) is reported as
 *   1.0 so that it fails the check instead of bypassing it.
 */
function _latinvalidation_nonlatin_proportion(string $text): ?float {
  $ignored = '\\p{Common}\\p{Inherited}';

  $total = preg_match_all('/[^' . $ignored . ']/u', $text);
  if ($total === FALSE) {
    return 1.0;
  }
  if ($total === 0) {
    return NULL;
  }

  $nonlatin = preg_match_all('/[^' . $ignored . '\\p{Latin}]/u', $text);
  if ($nonlatin === FALSE) {
    return 1.0;
  }

  return $nonlatin / $total;
}
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Written up at https://www.computerminds.co.uk/articles/block-spam-alphabet