Last active
May 1, 2025 18:16
-
-
Save xarical/47a4fa3dc62fa57e1d0558a058900e3c to your computer and use it in GitHub Desktop.
Testing various methods of automatically detecting readable text: frequency analysis using a custom frequency table, word detection using spaCy and the NLTK words corpus, nonsense string evaluation using Nostril, and using an LLM (GPT-4o-mini, served through G4F)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "authorship_tag": "ABX9TyMcFk8RyVinIsjebvovpMu9", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/xarical/47a4fa3dc62fa57e1d0558a058900e3c/automatic-detection-of-readable-text.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# Testing various methods of automatically detecting readable text" | |
| ], | |
| "metadata": { | |
| "id": "HZEdyinHx1Ns" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Using a frequency table" | |
| ], | |
| "metadata": { | |
| "id": "1wtQnK4RrsxR" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Frequency table (absolute frequency)\n", | |
| "frequency_table = {\n", | |
| " \" \": 275, \"!\": 0, '\"': 2, \"#\": 0, \"$\": 0, \"%\": 0, \"&\": 0, \"'\": 2, \"(\": 0,\n", | |
| " \")\": 0, \"*\": 0, \"+\": 0, \",\": 17, \"-\": 0, \".\": 24, \"/\": 0, \"0\": 2, \"1\": 2,\n", | |
| " \"2\": 0, \"3\": 1, \"4\": 0, \"5\": 1, \"6\": 1, \"7\": 1, \"8\": 0, \"9\": 2, \":\": 0,\n", | |
| " \";\": 0, \"<\": 0, \"=\": 0, \">\": 0, \"?\": 0, \"@\": 0, \"A\": 2, \"B\": 2, \"C\": 2,\n", | |
| " \"D\": 0, \"E\": 2, \"F\": 3, \"G\": 2, \"H\": 3, \"I\": 12, \"J\": 2, \"K\": 0, \"L\": 0,\n", | |
| " \"M\": 1, \"N\": 1, \"O\": 1, \"P\": 1, \"Q\": 0, \"R\": 0, \"S\": 9, \"T\": 6, \"U\": 0,\n", | |
| " \"V\": 0, \"W\": 3, \"X\": 0, \"Y\": 0, \"Z\": 0, \"[\": 0, \"\\\\\": 0, \"]\": 0, \"^\": 0,\n", | |
| " \"_\": 0, \"`\": 0, \"a\": 100, \"b\": 15, \"c\": 22, \"d\": 37, \"e\": 123, \"f\": 31,\n", | |
| " \"g\": 30, \"h\": 59, \"i\": 71, \"j\": 0, \"k\": 7, \"l\": 48, \"m\": 23, \"n\": 66,\n", | |
| " \"o\": 101, \"p\": 14, \"q\": 0, \"r\": 68, \"s\": 74, \"t\": 105, \"u\": 36, \"v\": 10,\n", | |
| " \"w\": 34, \"x\": 0, \"y\": 32, \"z\": 1, \"{\": 0, \"|\": 0, \"}\": 0, \"~\": 0, \"\\x7f\": 0\n", | |
| "}\n", | |
| "\n", | |
| "# Threshold of max avg deviation of percentage for frequencies_is_nonsense\n", | |
| "DEVIATION_THRESHOLD = 1\n", | |
| "\n", | |
| "def calculate_relative_frequencies(text: str) -> dict[str, float]:\n", | |
| " \"\"\"Calculate the relative frequencies of the characters in the text.\"\"\"\n", | |
| " # Collect absolute frequencies\n", | |
| " absolute_frequencies = {}\n", | |
| " for char in text:\n", | |
| " if char in absolute_frequencies:\n", | |
| " absolute_frequencies[char] += 1\n", | |
| " else:\n", | |
| " absolute_frequencies[char] = 1\n", | |
| "\n", | |
| " # Convert to relative frequencies\n", | |
| " relative_frequencies = {\n", | |
| " char: freq / len(text)\n", | |
| " for char, freq in absolute_frequencies.items()\n", | |
| " }\n", | |
| " return relative_frequencies\n", | |
| "\n", | |
| "def frequencies_is_nonsense(text: str) -> bool:\n", | |
| " \"\"\"\n", | |
| " Determine whether or not a given text is nonsense, using a frequency table.\n", | |
| " \"\"\"\n", | |
| " text_relative_frequencies = calculate_relative_frequencies(text)\n", | |
| " frequency_table_sum = sum(frequency_table.values()) # calculate outside loop\n", | |
| " expected_relative_frequencies = {\n", | |
| " char: freq / frequency_table_sum\n", | |
| " for char, freq in frequency_table.items()\n", | |
| " }\n", | |
| "\n", | |
| " # Collect deviations of percentages\n", | |
| " deviations = [\n", | |
| " abs(text_relative_frequencies.get(char, 0) - expected_freq)\n", | |
| " for char, expected_freq in expected_relative_frequencies.items()\n", | |
| " ]\n", | |
| "\n", | |
| " # Average deviations of percentages\n", | |
| " avg_deviation = (sum(deviations) / len(deviations)) * 100\n", | |
| " print(f\"Avg % deviation of # character apperances compared to frequencies: {avg_deviation:.2f}%\")\n", | |
| " return avg_deviation > DEVIATION_THRESHOLD" | |
| ], | |
| "metadata": { | |
| "id": "LASYvPIuN-f-" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Using spaCy word detection" | |
| ], | |
| "metadata": { | |
| "id": "7xFNHxHcrxMb" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "!pip install -q nltk spacy\n", | |
| "import nltk\n", | |
| "import spacy\n", | |
| "\n", | |
| "# Setup\n", | |
| "nltk.download(\"words\", download_dir=\".\") # download words list in the local dir\n", | |
| "nltk.data.path.append(\".\") # set resource lookup path\n", | |
| "words = set(nltk.corpus.words.words()) # load words list as a set for speed\n", | |
| "nlp = spacy.load(\"en_core_web_sm\") # load spaCy NLP model\n", | |
| "\n", | |
| "# Threshold of min percentage valid words for words_is_nonsense\n", | |
| "VALIDITY_THRESHOLD = 15\n", | |
| "\n", | |
| "def words_is_nonsense(text):\n", | |
| " # Tokenize the text\n", | |
| " tokens = nlp(text)\n", | |
| " num_valid_words = sum(\n", | |
| " 1 for token in tokens if token.lemma_.lower() in words or token.lemma_ in words\n", | |
| " )\n", | |
| " validity_percentage = (num_valid_words / len(tokens)) * 100\n", | |
| " print(f\"% validity of tokens compared to words list: {validity_percentage:.2f}%\")\n", | |
| " return validity_percentage < VALIDITY_THRESHOLD" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "U0GIhgwRdtIU", | |
| "outputId": "61a83d70-a4df-4b79-d19c-6a2766a303a3" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "[nltk_data] Downloading package words to ....\n", | |
| "[nltk_data] Unzipping corpora/words.zip.\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Using Nostril nonsense string evaluator" | |
| ], | |
| "metadata": { | |
| "id": "UYwxMfuhrGT4" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "!pip install -q git+https://github.com/casics/nostril.git\n", | |
| "from nostril import nonsense as nostril_is_nonsense" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "MSNLwK0ksPYy", | |
| "outputId": "1040291f-c36d-4419-a3db-93763253e5bb" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| " Building wheel for nostril (setup.py) ... \u001b[?25l\u001b[?25hdone\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Using an LLM (GPT-4o-mini, served through G4F)" | |
| ], | |
| "metadata": { | |
| "id": "7MuaeALRsw06" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "!pip -q install g4f\n", | |
| "from g4f.client import Client\n", | |
| "\n", | |
| "client = Client()\n", | |
| "\n", | |
| "system_prompt = \"\"\"\\\n", | |
| "Determine whether or not a given text is nonsense.\n", | |
| "Respond with True if the text is nonsense and False if the text is readable, human-understandable text.\n", | |
| "Respond with True or False and nothing else.\n", | |
| "\"\"\"\n", | |
| "\n", | |
| "def llm_is_nonsense(text):\n", | |
| " conversation_history = [\n", | |
| " {\"role\": \"system\", \"content\": system_prompt},\n", | |
| " {\"role\": \"user\", \"content\": text}\n", | |
| " ]\n", | |
| " c = client.chat.completions.create(\n", | |
| " model=\"gpt-4o-mini\",\n", | |
| " temperature=0,\n", | |
| " messages=conversation_history\n", | |
| " )\n", | |
| " r = c.choices[0].message.content\n", | |
| " print(\"Model said\", r)\n", | |
| " if r.strip() == \"True\":\n", | |
| " return True\n", | |
| " elif r.strip() == \"False\":\n", | |
| " return False\n", | |
| " else:\n", | |
| " raise ValueError(f\"LLM did not return True or False, got '{r}'\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "Iod4CNfTtCrX", | |
| "outputId": "3241fc8e-48ae-4bb3-bbc9-42c61dd85ddc" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.0/55.0 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m450.4/450.4 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.9/2.9 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m25.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25h" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Test" | |
| ], | |
| "metadata": { | |
| "id": "h0GlzJ8ZyLYK" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Note: the functions return True for nonsense text and False for readable text.\n", | |
| "# Accordingly, the texts dictionary has True for nonsense text and False for readable text\n", | |
| "# i.e. \"Is nonsense? True.\" and \"Is nonsense? False.\"\n", | |
| "texts = {\n", | |
| " 'DMEcalPreshowerDigis': True,\n", | |
| " 'f=lambda x:map(lambda x:x**2,[1,68,3]);f(3)': False,\n", | |
| " 'faiwtlwexu': True,\n", | |
| " 'asfgtqwafazfyiur': True,\n", | |
| " 'zxcvbnmlkjhgfdsaqwerty': True,\n", | |
| " 'this is a sentence.': False,\n", | |
| " 'print(\"hello world\")': False,\n", | |
| " 'Txgf lsnv uhz bhwgl ejmw fxhj yz': True,\n", | |
| " \"Mjqsfnso btypf qsbhfu xrzfgju hsp!\": True,\n", | |
| " '''\n", | |
| "m = re.findall(\n", | |
| " rf\"<{t}.*?>(.*?)</{t}>\",\n", | |
| " c,\n", | |
| " re.DOTALL\n", | |
| ")\n", | |
| "o[t].extend(m)\n", | |
| " ''': False,\n", | |
| " '''\n", | |
| "from MyOtherOtherClass import MyOtherOtherClass\n", | |
| "from typing import Any, Iterable, Optional\n", | |
| "\n", | |
| "class MyOtherClass(MyOtherOtherClass):\n", | |
| " \"\"\"My Other Class\"\"\"\n", | |
| " def __init__(self, x: Optional[Iterable] = None) -> None:\n", | |
| " super().__init__(x)\n", | |
| " self._y = 0\n", | |
| "\n", | |
| " def __getitem__(self, index: int) -> Any:\n", | |
| " return super().__getitem__(index)\n", | |
| "\n", | |
| "class MyClass():\n", | |
| " \"\"\"My Class\"\"\"\n", | |
| " def __init__():\n", | |
| " ...\n", | |
| "\n", | |
| " def __str__():\n", | |
| " ...\n", | |
| "\n", | |
| " def __repr__():\n", | |
| " ...\n", | |
| " ''': False,\n", | |
| "}\n", | |
| "methods = [\n", | |
| " frequencies_is_nonsense,\n", | |
| " words_is_nonsense,\n", | |
| " nostril_is_nonsense,\n", | |
| " llm_is_nonsense\n", | |
| "]\n", | |
| "scores = {method.__name__: 0 for method in methods}\n", | |
| "\n", | |
| "# Process texts\n", | |
| "for text in texts:\n", | |
| " print(f\"Testing '{text[:25].strip()}'\")\n", | |
| " results = {method.__name__: method(text) for method in methods}\n", | |
| " print(\"-----\")\n", | |
| " for name, result in results.items():\n", | |
| " if result == texts[text]:\n", | |
| " scores[name] += 1\n", | |
| " print(f\"{name}: {'NONSENSE' if result else 'REAL'}\")\n", | |
| " print(\"-----\\n\")" | |
| ], | |
| "metadata": { | |
| "id": "96koOiuQN8Vo", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "outputId": "a8926304-62f9-490f-da04-c75ae1bcd8e6" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Testing 'DMEcalPreshowerDigis'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 1.13%\n", | |
| "% validity of tokens compared to words list: 0.00%\n", | |
| "Model said True\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: NONSENSE\n", | |
| "words_is_nonsense: NONSENSE\n", | |
| "nonsense_detector: REAL\n", | |
| "llm_is_nonsense: NONSENSE\n", | |
| "-----\n", | |
| "\n", | |
| "Testing 'f=lambda x:map(lambda x:x'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 1.58%\n", | |
| "% validity of tokens compared to words list: 40.00%\n", | |
| "Model said False\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: NONSENSE\n", | |
| "words_is_nonsense: REAL\n", | |
| "nonsense_detector: REAL\n", | |
| "llm_is_nonsense: REAL\n", | |
| "-----\n", | |
| "\n", | |
| "Testing 'faiwtlwexu'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 1.32%\n", | |
| "% validity of tokens compared to words list: 0.00%\n", | |
| "Model said True\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: NONSENSE\n", | |
| "words_is_nonsense: NONSENSE\n", | |
| "nonsense_detector: NONSENSE\n", | |
| "llm_is_nonsense: NONSENSE\n", | |
| "-----\n", | |
| "\n", | |
| "Testing 'asfgtqwafazfyiur'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 1.29%\n", | |
| "% validity of tokens compared to words list: 0.00%\n", | |
| "Model said True\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: NONSENSE\n", | |
| "words_is_nonsense: NONSENSE\n", | |
| "nonsense_detector: NONSENSE\n", | |
| "llm_is_nonsense: NONSENSE\n", | |
| "-----\n", | |
| "\n", | |
| "Testing 'zxcvbnmlkjhgfdsaqwerty'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 1.03%\n", | |
| "% validity of tokens compared to words list: 0.00%\n", | |
| "Model said True\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: NONSENSE\n", | |
| "words_is_nonsense: NONSENSE\n", | |
| "nonsense_detector: NONSENSE\n", | |
| "llm_is_nonsense: NONSENSE\n", | |
| "-----\n", | |
| "\n", | |
| "Testing 'this is a sentence.'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 0.88%\n", | |
| "% validity of tokens compared to words list: 80.00%\n", | |
| "Model said False\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: REAL\n", | |
| "words_is_nonsense: REAL\n", | |
| "nonsense_detector: REAL\n", | |
| "llm_is_nonsense: REAL\n", | |
| "-----\n", | |
| "\n", | |
| "Testing 'print(\"hello world\")'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 1.07%\n", | |
| "% validity of tokens compared to words list: 25.00%\n", | |
| "Model said False\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: NONSENSE\n", | |
| "words_is_nonsense: REAL\n", | |
| "nonsense_detector: REAL\n", | |
| "llm_is_nonsense: REAL\n", | |
| "-----\n", | |
| "\n", | |
| "Testing 'Txgf lsnv uhz bhwgl ejmw'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 1.05%\n", | |
| "% validity of tokens compared to words list: 0.00%\n", | |
| "Model said True\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: NONSENSE\n", | |
| "words_is_nonsense: NONSENSE\n", | |
| "nonsense_detector: NONSENSE\n", | |
| "llm_is_nonsense: NONSENSE\n", | |
| "-----\n", | |
| "\n", | |
| "Testing 'Mjqsfnso btypf qsbhfu xrz'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 1.18%\n", | |
| "% validity of tokens compared to words list: 0.00%\n", | |
| "Model said True\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: NONSENSE\n", | |
| "words_is_nonsense: NONSENSE\n", | |
| "nonsense_detector: NONSENSE\n", | |
| "llm_is_nonsense: NONSENSE\n", | |
| "-----\n", | |
| "\n", | |
| "Testing 'm = re.findall(\n", | |
| " rf\"<{t'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 1.01%\n", | |
| "% validity of tokens compared to words list: 12.50%\n", | |
| "Model said False\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: NONSENSE\n", | |
| "words_is_nonsense: NONSENSE\n", | |
| "nonsense_detector: REAL\n", | |
| "llm_is_nonsense: REAL\n", | |
| "-----\n", | |
| "\n", | |
| "Testing 'from MyOtherOtherClass i'\n", | |
| "Avg % deviation of # character apperances compared to frequencies: 0.60%\n", | |
| "% validity of tokens compared to words list: 17.80%\n", | |
| "Model said False\n", | |
| "-----\n", | |
| "frequencies_is_nonsense: REAL\n", | |
| "words_is_nonsense: REAL\n", | |
| "nonsense_detector: REAL\n", | |
| "llm_is_nonsense: REAL\n", | |
| "-----\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "print(\"Final scores:\")\n", | |
| "scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))\n", | |
| "for i, (method, score) in enumerate(scores.items()):\n", | |
| " print(f\"{i + 1}. {method}: {score}\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "ArDs3SstBdKg", | |
| "outputId": "cd1e2936-b1b9-40de-bb4c-55a28bdb1075" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Final scores:\n", | |
| "1. llm_is_nonsense: 11\n", | |
| "2. words_is_nonsense: 10\n", | |
| "3. nonsense_detector: 10\n", | |
| "4. frequencies_is_nonsense: 8\n" | |
| ] | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment