xarical · May 1, 2025 18:16
diff --git a/automatic-detection-of-readable-text.ipynb b/automatic-detection-of-readable-text.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyMcFk8RyVinIsjebvovpMu9",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/xarical/47a4fa3dc62fa57e1d0558a058900e3c/automatic-detection-of-readable-text.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Testing various methods of automatically detecting readable text"
      ],
      "metadata": {
        "id": "HZEdyinHx1Ns"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Using a frequency table"
      ],
      "metadata": {
        "id": "1wtQnK4RrsxR"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Frequency table (absolute frequency)\n",
        "frequency_table = {\n",
        "    \" \": 275, \"!\": 0, '\"': 2, \"#\": 0, \"$\": 0, \"%\": 0, \"&\": 0, \"'\": 2, \"(\": 0,\n",
        "    \")\": 0, \"*\": 0, \"+\": 0, \",\": 17, \"-\": 0, \".\": 24, \"/\": 0, \"0\": 2, \"1\": 2,\n",
        "    \"2\": 0, \"3\": 1, \"4\": 0, \"5\": 1, \"6\": 1, \"7\": 1, \"8\": 0, \"9\": 2, \":\": 0,\n",
        "    \";\": 0, \"<\": 0, \"=\": 0, \">\": 0, \"?\": 0, \"@\": 0, \"A\": 2, \"B\": 2, \"C\": 2,\n",
        "    \"D\": 0, \"E\": 2, \"F\": 3, \"G\": 2, \"H\": 3, \"I\": 12, \"J\": 2, \"K\": 0, \"L\": 0,\n",
        "    \"M\": 1, \"N\": 1, \"O\": 1, \"P\": 1, \"Q\": 0, \"R\": 0, \"S\": 9, \"T\": 6, \"U\": 0,\n",
        "    \"V\": 0, \"W\": 3, \"X\": 0, \"Y\": 0, \"Z\": 0, \"[\": 0, \"\\\\\": 0, \"]\": 0, \"^\": 0,\n",
        "    \"_\": 0, \"`\": 0, \"a\": 100, \"b\": 15, \"c\": 22, \"d\": 37, \"e\": 123, \"f\": 31,\n",
        "    \"g\": 30, \"h\": 59, \"i\": 71, \"j\": 0, \"k\": 7, \"l\": 48, \"m\": 23, \"n\": 66,\n",
        "    \"o\": 101, \"p\": 14, \"q\": 0, \"r\": 68, \"s\": 74, \"t\": 105, \"u\": 36, \"v\": 10,\n",
        "    \"w\": 34, \"x\": 0, \"y\": 32, \"z\": 1, \"{\": 0, \"|\": 0, \"}\": 0, \"~\": 0, \"\\x7f\": 0\n",
        "}\n",
        "\n",
        "# Threshold of max avg deviation of percentage for frequency_nonsense\n",
        "DEVIATION_THRESHOLD = 1\n",
        "\n",
        "def calculate_relative_frequencies(text: str) -> dict[str, float]:\n",
        "    \"\"\"Calculate the relative frequencies of the characters in the text.\"\"\"\n",
        "    # Collect absolute frequencies\n",
        "    absolute_frequencies = {}\n",
        "    for char in text:\n",
        "        if char in absolute_frequencies:\n",
        "            absolute_frequencies[char] += 1\n",
        "        else:\n",
        "            absolute_frequencies[char] = 1\n",
        "\n",
        "    # Convert to relative frequencies\n",
        "    relative_frequencies = {\n",
        "        char: freq / len(text)\n",
        "        for char, freq in absolute_frequencies.items()\n",
        "    }\n",
        "    return relative_frequencies\n",
        "\n",
        "def frequencies_is_nonsense(text: str) -> bool:\n",
        "    \"\"\"\n",
        "    Determine whether or not a given text is nonsense, using a frequency table.\n",
        "    \"\"\"\n",
        "    text_relative_frequencies = calculate_relative_frequencies(text)\n",
        "    frequency_table_sum = sum(frequency_table.values()) # calculate outside loop\n",
        "    expected_relative_frequencies = {\n",
        "        char: freq / frequency_table_sum\n",
        "        for char, freq in frequency_table.items()\n",
        "    }\n",
        "\n",
        "    # Collect deviations of percentages\n",
        "    deviations = [\n",
        "        abs(text_relative_frequencies.get(char, 0) - expected_freq)\n",
        "        for char, expected_freq in expected_relative_frequencies.items()\n",
        "    ]\n",
        "\n",
        "    # Average deviations of percentages\n",
        "    avg_deviation = (sum(deviations) / len(deviations)) * 100\n",
        "    print(f\"Avg % deviation of # character apperances compared to frequencies: {avg_deviation:.2f}%\")\n",
        "    return avg_deviation > DEVIATION_THRESHOLD"
      ],
      "metadata": {
        "id": "LASYvPIuN-f-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Using spaCy word detection"
      ],
      "metadata": {
        "id": "7xFNHxHcrxMb"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -q nltk spacy\n",
        "import nltk\n",
        "import spacy\n",
        "\n",
        "# Setup\n",
        "nltk.download(\"words\", download_dir=\".\") # download words list in the local dir\n",
        "nltk.data.path.append(\".\") # set resource lookup path\n",
        "words = set(nltk.corpus.words.words()) # load words list as a set for speed\n",
        "nlp = spacy.load(\"en_core_web_sm\") # load spaCy NLP model\n",
        "\n",
        "# Threshold of min percentage valid words for words_is_nonsense\n",
        "VALIDITY_THRESHOLD = 15\n",
        "\n",
        "def words_is_nonsense(text):\n",
        "    # Tokenize the text\n",
        "    tokens = nlp(text)\n",
        "    num_valid_words = sum(\n",
        "        1 for token in tokens if token.lemma_.lower() in words or token.lemma_ in words\n",
        "    )\n",
        "    validity_percentage = (num_valid_words / len(tokens)) * 100\n",
        "    print(f\"% validity of tokens compared to words list: {validity_percentage:.2f}%\")\n",
        "    return validity_percentage < VALIDITY_THRESHOLD"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "U0GIhgwRdtIU",
        "outputId": "61a83d70-a4df-4b79-d19c-6a2766a303a3"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package words to ....\n",
            "[nltk_data]   Unzipping corpora/words.zip.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Using Nostril nonsense string evaluator"
      ],
      "metadata": {
        "id": "UYwxMfuhrGT4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -q git+https://github.com/casics/nostril.git\n",
        "from nostril import nonsense as nostril_is_nonsense"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MSNLwK0ksPYy",
        "outputId": "1040291f-c36d-4419-a3db-93763253e5bb"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Building wheel for nostril (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Using an LLM (GPT-4o-mini, served through G4F)"
      ],
      "metadata": {
        "id": "7MuaeALRsw06"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip -q install g4f\n",
        "from g4f.client import Client\n",
        "\n",
        "client = Client()\n",
        "\n",
        "system_prompt = \"\"\"\\\n",
        "Determine whether or not a given text is nonsense.\n",
        "Respond with True if the text is nonsense and False if the text is readable, human-understandable text.\n",
        "Respond with True or False and nothing else.\n",
        "\"\"\"\n",
        "\n",
        "def llm_is_nonsense(text):\n",
        "  conversation_history = [\n",
        "      {\"role\": \"system\", \"content\": system_prompt},\n",
        "      {\"role\": \"user\", \"content\": text}\n",
        "  ]\n",
        "  c = client.chat.completions.create(\n",
        "    model=\"gpt-4o-mini\",\n",
        "    temperature=0,\n",
        "    messages=conversation_history\n",
        "  )\n",
        "  r = c.choices[0].message.content\n",
        "  print(\"Model said\", r)\n",
        "  if r.strip() == \"True\":\n",
        "    return True\n",
        "  elif r.strip() == \"False\":\n",
        "    return False\n",
        "  else:\n",
        "    raise ValueError(f\"LLM did not return True or False, got '{r}'\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Iod4CNfTtCrX",
        "outputId": "3241fc8e-48ae-4bb3-bbc9-42c61dd85ddc"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.0/55.0 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m450.4/450.4 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.9/2.9 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m25.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Test"
      ],
      "metadata": {
        "id": "h0GlzJ8ZyLYK"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Note: the functions return False for nonsense text and True for readable text.\n",
        "# Accordingly, the texts dictionary has False for nonsense text and True for readable text\n",
        "# i.e. \"Is nonsense? False.\" and \"Is nonsense? True.\"\n",
        "texts = {\n",
        "    'DMEcalPreshowerDigis': True,\n",
        "    'f=lambda x:map(lambda x:x**2,[1,68,3]);f(3)': False,\n",
        "    'faiwtlwexu': True,\n",
        "    'asfgtqwafazfyiur': True,\n",
        "    'zxcvbnmlkjhgfdsaqwerty': True,\n",
        "    'this is a sentence.': False,\n",
        "    'print(\"hello world\")': False,\n",
        "    'Txgf lsnv uhz bhwgl ejmw fxhj yz': True,\n",
        "    \"Mjqsfnso btypf qsbhfu xrzfgju hsp!\": True,\n",
        "    '''\n",
        "m = re.findall(\n",
        "  rf\"<{t}.*?>(.*?)</{t}>\",\n",
        "  c,\n",
        "  re.DOTALL\n",
        ")\n",
        "o[t].extend(m)\n",
        "    ''': False,\n",
        "    '''\n",
        "from MyOtherOtherClass import MyOtherOtherClass\n",
        "from typing import Any, Iterable, Optional\n",
        "\n",
        "class MyOtherClass(MyOtherOtherClass):\n",
        "  \"\"\"My Other Class\"\"\"\n",
        "  def __init__(self, x: Optional[Iterable] = None) -> None:\n",
        "    super().__init__(x)\n",
        "    self._y = 0\n",
        "\n",
        "  def __getitem__(self, index: int) -> Any:\n",
        "    return super().__getitem__(index)\n",
        "\n",
        "class MyClass():\n",
        "  \"\"\"My Class\"\"\"\n",
        "  def __init__():\n",
        "      ...\n",
        "\n",
        "  def __str__():\n",
        "      ...\n",
        "\n",
        "  def __repr__():\n",
        "    ...\n",
        "    ''': False,\n",
        "}\n",
        "methods = [\n",
        "  frequencies_is_nonsense,\n",
        "  words_is_nonsense,\n",
        "  nostril_is_nonsense,\n",
        "  llm_is_nonsense\n",
        "]\n",
        "scores = {method.__name__: 0 for method in methods}\n",
        "\n",
        "# Process texts\n",
        "for text in texts:\n",
        "  print(f\"Testing '{text[:25].strip()}'\")\n",
        "  results = {method.__name__: method(text) for method in methods}\n",
        "  print(\"-----\")\n",
        "  for name, result in results.items():\n",
        "    if result == texts[text]:\n",
        "      scores[name] += 1\n",
        "    print(f\"{name}: {'NONSENSE' if result else 'REAL'}\")\n",
        "  print(\"-----\\n\")"
      ],
      "metadata": {
        "id": "96koOiuQN8Vo",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "a8926304-62f9-490f-da04-c75ae1bcd8e6"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Testing 'DMEcalPreshowerDigis'\n",
            "Avg % deviation of # character apperances compared to frequencies: 1.13%\n",
            "% validity of tokens compared to words list: 0.00%\n",
            "Model said True\n",
            "-----\n",
            "frequencies_is_nonsense: NONSENSE\n",
            "words_is_nonsense: NONSENSE\n",
            "nonsense_detector: REAL\n",
            "llm_is_nonsense: NONSENSE\n",
            "-----\n",
            "\n",
            "Testing 'f=lambda x:map(lambda x:x'\n",
            "Avg % deviation of # character apperances compared to frequencies: 1.58%\n",
            "% validity of tokens compared to words list: 40.00%\n",
            "Model said False\n",
            "-----\n",
            "frequencies_is_nonsense: NONSENSE\n",
            "words_is_nonsense: REAL\n",
            "nonsense_detector: REAL\n",
            "llm_is_nonsense: REAL\n",
            "-----\n",
            "\n",
            "Testing 'faiwtlwexu'\n",
            "Avg % deviation of # character apperances compared to frequencies: 1.32%\n",
            "% validity of tokens compared to words list: 0.00%\n",
            "Model said True\n",
            "-----\n",
            "frequencies_is_nonsense: NONSENSE\n",
            "words_is_nonsense: NONSENSE\n",
            "nonsense_detector: NONSENSE\n",
            "llm_is_nonsense: NONSENSE\n",
            "-----\n",
            "\n",
            "Testing 'asfgtqwafazfyiur'\n",
            "Avg % deviation of # character apperances compared to frequencies: 1.29%\n",
            "% validity of tokens compared to words list: 0.00%\n",
            "Model said True\n",
            "-----\n",
            "frequencies_is_nonsense: NONSENSE\n",
            "words_is_nonsense: NONSENSE\n",
            "nonsense_detector: NONSENSE\n",
            "llm_is_nonsense: NONSENSE\n",
            "-----\n",
            "\n",
            "Testing 'zxcvbnmlkjhgfdsaqwerty'\n",
            "Avg % deviation of # character apperances compared to frequencies: 1.03%\n",
            "% validity of tokens compared to words list: 0.00%\n",
            "Model said True\n",
            "-----\n",
            "frequencies_is_nonsense: NONSENSE\n",
            "words_is_nonsense: NONSENSE\n",
            "nonsense_detector: NONSENSE\n",
            "llm_is_nonsense: NONSENSE\n",
            "-----\n",
            "\n",
            "Testing 'this is a sentence.'\n",
            "Avg % deviation of # character apperances compared to frequencies: 0.88%\n",
            "% validity of tokens compared to words list: 80.00%\n",
            "Model said False\n",
            "-----\n",
            "frequencies_is_nonsense: REAL\n",
            "words_is_nonsense: REAL\n",
            "nonsense_detector: REAL\n",
            "llm_is_nonsense: REAL\n",
            "-----\n",
            "\n",
            "Testing 'print(\"hello world\")'\n",
            "Avg % deviation of # character apperances compared to frequencies: 1.07%\n",
            "% validity of tokens compared to words list: 25.00%\n",
            "Model said False\n",
            "-----\n",
            "frequencies_is_nonsense: NONSENSE\n",
            "words_is_nonsense: REAL\n",
            "nonsense_detector: REAL\n",
            "llm_is_nonsense: REAL\n",
            "-----\n",
            "\n",
            "Testing 'Txgf lsnv uhz bhwgl ejmw'\n",
            "Avg % deviation of # character apperances compared to frequencies: 1.05%\n",
            "% validity of tokens compared to words list: 0.00%\n",
            "Model said True\n",
            "-----\n",
            "frequencies_is_nonsense: NONSENSE\n",
            "words_is_nonsense: NONSENSE\n",
            "nonsense_detector: NONSENSE\n",
            "llm_is_nonsense: NONSENSE\n",
            "-----\n",
            "\n",
            "Testing 'Mjqsfnso btypf qsbhfu xrz'\n",
            "Avg % deviation of # character apperances compared to frequencies: 1.18%\n",
            "% validity of tokens compared to words list: 0.00%\n",
            "Model said True\n",
            "-----\n",
            "frequencies_is_nonsense: NONSENSE\n",
            "words_is_nonsense: NONSENSE\n",
            "nonsense_detector: NONSENSE\n",
            "llm_is_nonsense: NONSENSE\n",
            "-----\n",
            "\n",
            "Testing 'm = re.findall(\n",
            "  rf\"<{t'\n",
            "Avg % deviation of # character apperances compared to frequencies: 1.01%\n",
            "% validity of tokens compared to words list: 12.50%\n",
            "Model said False\n",
            "-----\n",
            "frequencies_is_nonsense: NONSENSE\n",
            "words_is_nonsense: NONSENSE\n",
            "nonsense_detector: REAL\n",
            "llm_is_nonsense: REAL\n",
            "-----\n",
            "\n",
            "Testing 'from MyOtherOtherClass i'\n",
            "Avg % deviation of # character apperances compared to frequencies: 0.60%\n",
            "% validity of tokens compared to words list: 17.80%\n",
            "Model said False\n",
            "-----\n",
            "frequencies_is_nonsense: REAL\n",
            "words_is_nonsense: REAL\n",
            "nonsense_detector: REAL\n",
            "llm_is_nonsense: REAL\n",
            "-----\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "print(\"Final scores:\")\n",
        "scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))\n",
        "for i, (method, score) in enumerate(scores.items()):\n",
        "    print(f\"{i + 1}. {method}: {score}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ArDs3SstBdKg",
        "outputId": "cd1e2936-b1b9-40de-bb4c-55a28bdb1075"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Final scores:\n",
            "1. llm_is_nonsense: 11\n",
            "2. words_is_nonsense: 10\n",
            "3. nonsense_detector: 10\n",
            "4. frequencies_is_nonsense: 8\n"
          ]
        }
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"authorship_tag": "ABX9TyMcFk8RyVinIsjebvovpMu9",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/xarical/47a4fa3dc62fa57e1d0558a058900e3c/automatic-detection-of-readable-text.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Testing various methods of automatically detecting readable text"
	],
	"metadata": {
	"id": "HZEdyinHx1Ns"
	}
	},
	{
	"cell_type": "markdown",
	"source": [
	"Using a frequency table"
	],
	"metadata": {
	"id": "1wtQnK4RrsxR"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# Frequency table (absolute frequency)\n",
	"frequency_table = {\n",
	" \" \": 275, \"!\": 0, '\"': 2, \"#\": 0, \"$\": 0, \"%\": 0, \"&\": 0, \"'\": 2, \"(\": 0,\n",
	" \")\": 0, \"*\": 0, \"+\": 0, \",\": 17, \"-\": 0, \".\": 24, \"/\": 0, \"0\": 2, \"1\": 2,\n",
	" \"2\": 0, \"3\": 1, \"4\": 0, \"5\": 1, \"6\": 1, \"7\": 1, \"8\": 0, \"9\": 2, \":\": 0,\n",
	" \";\": 0, \"<\": 0, \"=\": 0, \">\": 0, \"?\": 0, \"@\": 0, \"A\": 2, \"B\": 2, \"C\": 2,\n",
	" \"D\": 0, \"E\": 2, \"F\": 3, \"G\": 2, \"H\": 3, \"I\": 12, \"J\": 2, \"K\": 0, \"L\": 0,\n",
	" \"M\": 1, \"N\": 1, \"O\": 1, \"P\": 1, \"Q\": 0, \"R\": 0, \"S\": 9, \"T\": 6, \"U\": 0,\n",
	" \"V\": 0, \"W\": 3, \"X\": 0, \"Y\": 0, \"Z\": 0, \"[\": 0, \"\\\\\": 0, \"]\": 0, \"^\": 0,\n",
	" \"_\": 0, \"`\": 0, \"a\": 100, \"b\": 15, \"c\": 22, \"d\": 37, \"e\": 123, \"f\": 31,\n",
	" \"g\": 30, \"h\": 59, \"i\": 71, \"j\": 0, \"k\": 7, \"l\": 48, \"m\": 23, \"n\": 66,\n",
	" \"o\": 101, \"p\": 14, \"q\": 0, \"r\": 68, \"s\": 74, \"t\": 105, \"u\": 36, \"v\": 10,\n",
	" \"w\": 34, \"x\": 0, \"y\": 32, \"z\": 1, \"{\": 0, \"\|\": 0, \"}\": 0, \"~\": 0, \"\\x7f\": 0\n",
	"}\n",
	"\n",
	"# Threshold of max avg deviation of percentage for frequency_nonsense\n",
	"DEVIATION_THRESHOLD = 1\n",
	"\n",
	"def calculate_relative_frequencies(text: str) -> dict[str, float]:\n",
	" \"\"\"Calculate the relative frequencies of the characters in the text.\"\"\"\n",
	" # Collect absolute frequencies\n",
	" absolute_frequencies = {}\n",
	" for char in text:\n",
	" if char in absolute_frequencies:\n",
	" absolute_frequencies[char] += 1\n",
	" else:\n",
	" absolute_frequencies[char] = 1\n",
	"\n",
	" # Convert to relative frequencies\n",
	" relative_frequencies = {\n",
	" char: freq / len(text)\n",
	" for char, freq in absolute_frequencies.items()\n",
	" }\n",
	" return relative_frequencies\n",
	"\n",
	"def frequencies_is_nonsense(text: str) -> bool:\n",
	" \"\"\"\n",
	" Determine whether or not a given text is nonsense, using a frequency table.\n",
	" \"\"\"\n",
	" text_relative_frequencies = calculate_relative_frequencies(text)\n",
	" frequency_table_sum = sum(frequency_table.values()) # calculate outside loop\n",
	" expected_relative_frequencies = {\n",
	" char: freq / frequency_table_sum\n",
	" for char, freq in frequency_table.items()\n",
	" }\n",
	"\n",
	" # Collect deviations of percentages\n",
	" deviations = [\n",
	" abs(text_relative_frequencies.get(char, 0) - expected_freq)\n",
	" for char, expected_freq in expected_relative_frequencies.items()\n",
	" ]\n",
	"\n",
	" # Average deviations of percentages\n",
	" avg_deviation = (sum(deviations) / len(deviations)) * 100\n",
	" print(f\"Avg % deviation of # character apperances compared to frequencies: {avg_deviation:.2f}%\")\n",
	" return avg_deviation > DEVIATION_THRESHOLD"
	],
	"metadata": {
	"id": "LASYvPIuN-f-"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"Using spaCy word detection"
	],
	"metadata": {
	"id": "7xFNHxHcrxMb"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"!pip install -q nltk spacy\n",
	"import nltk\n",
	"import spacy\n",
	"\n",
	"# Setup\n",
	"nltk.download(\"words\", download_dir=\".\") # download words list in the local dir\n",
	"nltk.data.path.append(\".\") # set resource lookup path\n",
	"words = set(nltk.corpus.words.words()) # load words list as a set for speed\n",
	"nlp = spacy.load(\"en_core_web_sm\") # load spaCy NLP model\n",
	"\n",
	"# Threshold of min percentage valid words for words_is_nonsense\n",
	"VALIDITY_THRESHOLD = 15\n",
	"\n",
	"def words_is_nonsense(text):\n",
	" # Tokenize the text\n",
	" tokens = nlp(text)\n",
	" num_valid_words = sum(\n",
	" 1 for token in tokens if token.lemma_.lower() in words or token.lemma_ in words\n",
	" )\n",
	" validity_percentage = (num_valid_words / len(tokens)) * 100\n",
	" print(f\"% validity of tokens compared to words list: {validity_percentage:.2f}%\")\n",
	" return validity_percentage < VALIDITY_THRESHOLD"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "U0GIhgwRdtIU",
	"outputId": "61a83d70-a4df-4b79-d19c-6a2766a303a3"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stderr",
	"text": [
	"[nltk_data] Downloading package words to ....\n",
	"[nltk_data] Unzipping corpora/words.zip.\n"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"Using Nostril nonsense string evaluator"
	],
	"metadata": {
	"id": "UYwxMfuhrGT4"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"!pip install -q git+https://github.com/casics/nostril.git\n",
	"from nostril import nonsense as nostril_is_nonsense"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "MSNLwK0ksPYy",
	"outputId": "1040291f-c36d-4419-a3db-93763253e5bb"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
	" Building wheel for nostril (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"Using an LLM (GPT-4o-mini, served through G4F)"
	],
	"metadata": {
	"id": "7MuaeALRsw06"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"!pip -q install g4f\n",
	"from g4f.client import Client\n",
	"\n",
	"client = Client()\n",
	"\n",
	"system_prompt = \"\"\"\\\n",
	"Determine whether or not a given text is nonsense.\n",
	"Respond with True if the text is nonsense and False if the text is readable, human-understandable text.\n",
	"Respond with True or False and nothing else.\n",
	"\"\"\"\n",
	"\n",
	"def llm_is_nonsense(text):\n",
	" conversation_history = [\n",
	" {\"role\": \"system\", \"content\": system_prompt},\n",
	" {\"role\": \"user\", \"content\": text}\n",
	" ]\n",
	" c = client.chat.completions.create(\n",
	" model=\"gpt-4o-mini\",\n",
	" temperature=0,\n",
	" messages=conversation_history\n",
	" )\n",
	" r = c.choices[0].message.content\n",
	" print(\"Model said\", r)\n",
	" if r.strip() == \"True\":\n",
	" return True\n",
	" elif r.strip() == \"False\":\n",
	" return False\n",
	" else:\n",
	" raise ValueError(f\"LLM did not return True or False, got '{r}'\")"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "Iod4CNfTtCrX",
	"outputId": "3241fc8e-48ae-4bb3-bbc9-42c61dd85ddc"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.0/55.0 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m450.4/450.4 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.9/2.9 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m25.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25h"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"Test"
	],
	"metadata": {
	"id": "h0GlzJ8ZyLYK"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# Note: the functions return False for nonsense text and True for readable text.\n",
	"# Accordingly, the texts dictionary has False for nonsense text and True for readable text\n",
	"# i.e. \"Is nonsense? False.\" and \"Is nonsense? True.\"\n",
	"texts = {\n",
	" 'DMEcalPreshowerDigis': True,\n",
	" 'f=lambda x:map(lambda x:x**2,[1,68,3]);f(3)': False,\n",
	" 'faiwtlwexu': True,\n",
	" 'asfgtqwafazfyiur': True,\n",
	" 'zxcvbnmlkjhgfdsaqwerty': True,\n",
	" 'this is a sentence.': False,\n",
	" 'print(\"hello world\")': False,\n",
	" 'Txgf lsnv uhz bhwgl ejmw fxhj yz': True,\n",
	" \"Mjqsfnso btypf qsbhfu xrzfgju hsp!\": True,\n",
	" '''\n",
	"m = re.findall(\n",
	" rf\"<{t}.?>(.?)</{t}>\",\n",
	" c,\n",
	" re.DOTALL\n",
	")\n",
	"o[t].extend(m)\n",
	" ''': False,\n",
	" '''\n",
	"from MyOtherOtherClass import MyOtherOtherClass\n",
	"from typing import Any, Iterable, Optional\n",
	"\n",
	"class MyOtherClass(MyOtherOtherClass):\n",
	" \"\"\"My Other Class\"\"\"\n",
	" def __init__(self, x: Optional[Iterable] = None) -> None:\n",
	" super().__init__(x)\n",
	" self._y = 0\n",
	"\n",
	" def __getitem__(self, index: int) -> Any:\n",
	" return super().__getitem__(index)\n",
	"\n",
	"class MyClass():\n",
	" \"\"\"My Class\"\"\"\n",
	" def __init__():\n",
	" ...\n",
	"\n",
	" def __str__():\n",
	" ...\n",
	"\n",
	" def __repr__():\n",
	" ...\n",
	" ''': False,\n",
	"}\n",
	"methods = [\n",
	" frequencies_is_nonsense,\n",
	" words_is_nonsense,\n",
	" nostril_is_nonsense,\n",
	" llm_is_nonsense\n",
	"]\n",
	"scores = {method.__name__: 0 for method in methods}\n",
	"\n",
	"# Process texts\n",
	"for text in texts:\n",
	" print(f\"Testing '{text[:25].strip()}'\")\n",
	" results = {method.__name__: method(text) for method in methods}\n",
	" print(\"-----\")\n",
	" for name, result in results.items():\n",
	" if result == texts[text]:\n",
	" scores[name] += 1\n",
	" print(f\"{name}: {'NONSENSE' if result else 'REAL'}\")\n",
	" print(\"-----\\n\")"
	],
	"metadata": {
	"id": "96koOiuQN8Vo",
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"outputId": "a8926304-62f9-490f-da04-c75ae1bcd8e6"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Testing 'DMEcalPreshowerDigis'\n",
	"Avg % deviation of # character apperances compared to frequencies: 1.13%\n",
	"% validity of tokens compared to words list: 0.00%\n",
	"Model said True\n",
	"-----\n",
	"frequencies_is_nonsense: NONSENSE\n",
	"words_is_nonsense: NONSENSE\n",
	"nonsense_detector: REAL\n",
	"llm_is_nonsense: NONSENSE\n",
	"-----\n",
	"\n",
	"Testing 'f=lambda x:map(lambda x:x'\n",
	"Avg % deviation of # character apperances compared to frequencies: 1.58%\n",
	"% validity of tokens compared to words list: 40.00%\n",
	"Model said False\n",
	"-----\n",
	"frequencies_is_nonsense: NONSENSE\n",
	"words_is_nonsense: REAL\n",
	"nonsense_detector: REAL\n",
	"llm_is_nonsense: REAL\n",
	"-----\n",
	"\n",
	"Testing 'faiwtlwexu'\n",
	"Avg % deviation of # character apperances compared to frequencies: 1.32%\n",
	"% validity of tokens compared to words list: 0.00%\n",
	"Model said True\n",
	"-----\n",
	"frequencies_is_nonsense: NONSENSE\n",
	"words_is_nonsense: NONSENSE\n",
	"nonsense_detector: NONSENSE\n",
	"llm_is_nonsense: NONSENSE\n",
	"-----\n",
	"\n",
	"Testing 'asfgtqwafazfyiur'\n",
	"Avg % deviation of # character apperances compared to frequencies: 1.29%\n",
	"% validity of tokens compared to words list: 0.00%\n",
	"Model said True\n",
	"-----\n",
	"frequencies_is_nonsense: NONSENSE\n",
	"words_is_nonsense: NONSENSE\n",
	"nonsense_detector: NONSENSE\n",
	"llm_is_nonsense: NONSENSE\n",
	"-----\n",
	"\n",
	"Testing 'zxcvbnmlkjhgfdsaqwerty'\n",
	"Avg % deviation of # character apperances compared to frequencies: 1.03%\n",
	"% validity of tokens compared to words list: 0.00%\n",
	"Model said True\n",
	"-----\n",
	"frequencies_is_nonsense: NONSENSE\n",
	"words_is_nonsense: NONSENSE\n",
	"nonsense_detector: NONSENSE\n",
	"llm_is_nonsense: NONSENSE\n",
	"-----\n",
	"\n",
	"Testing 'this is a sentence.'\n",
	"Avg % deviation of # character apperances compared to frequencies: 0.88%\n",
	"% validity of tokens compared to words list: 80.00%\n",
	"Model said False\n",
	"-----\n",
	"frequencies_is_nonsense: REAL\n",
	"words_is_nonsense: REAL\n",
	"nonsense_detector: REAL\n",
	"llm_is_nonsense: REAL\n",
	"-----\n",
	"\n",
	"Testing 'print(\"hello world\")'\n",
	"Avg % deviation of # character apperances compared to frequencies: 1.07%\n",
	"% validity of tokens compared to words list: 25.00%\n",
	"Model said False\n",
	"-----\n",
	"frequencies_is_nonsense: NONSENSE\n",
	"words_is_nonsense: REAL\n",
	"nonsense_detector: REAL\n",
	"llm_is_nonsense: REAL\n",
	"-----\n",
	"\n",
	"Testing 'Txgf lsnv uhz bhwgl ejmw'\n",
	"Avg % deviation of # character apperances compared to frequencies: 1.05%\n",
	"% validity of tokens compared to words list: 0.00%\n",
	"Model said True\n",
	"-----\n",
	"frequencies_is_nonsense: NONSENSE\n",
	"words_is_nonsense: NONSENSE\n",
	"nonsense_detector: NONSENSE\n",
	"llm_is_nonsense: NONSENSE\n",
	"-----\n",
	"\n",
	"Testing 'Mjqsfnso btypf qsbhfu xrz'\n",
	"Avg % deviation of # character apperances compared to frequencies: 1.18%\n",
	"% validity of tokens compared to words list: 0.00%\n",
	"Model said True\n",
	"-----\n",
	"frequencies_is_nonsense: NONSENSE\n",
	"words_is_nonsense: NONSENSE\n",
	"nonsense_detector: NONSENSE\n",
	"llm_is_nonsense: NONSENSE\n",
	"-----\n",
	"\n",
	"Testing 'm = re.findall(\n",
	" rf\"<{t'\n",
	"Avg % deviation of # character apperances compared to frequencies: 1.01%\n",
	"% validity of tokens compared to words list: 12.50%\n",
	"Model said False\n",
	"-----\n",
	"frequencies_is_nonsense: NONSENSE\n",
	"words_is_nonsense: NONSENSE\n",
	"nonsense_detector: REAL\n",
	"llm_is_nonsense: REAL\n",
	"-----\n",
	"\n",
	"Testing 'from MyOtherOtherClass i'\n",
	"Avg % deviation of # character apperances compared to frequencies: 0.60%\n",
	"% validity of tokens compared to words list: 17.80%\n",
	"Model said False\n",
	"-----\n",
	"frequencies_is_nonsense: REAL\n",
	"words_is_nonsense: REAL\n",
	"nonsense_detector: REAL\n",
	"llm_is_nonsense: REAL\n",
	"-----\n",
	"\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"print(\"Final scores:\")\n",
	"scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))\n",
	"for i, (method, score) in enumerate(scores.items()):\n",
	" print(f\"{i + 1}. {method}: {score}\")"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "ArDs3SstBdKg",
	"outputId": "cd1e2936-b1b9-40de-bb4c-55a28bdb1075"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Final scores:\n",
	"1. llm_is_nonsense: 11\n",
	"2. words_is_nonsense: 10\n",
	"3. nonsense_detector: 10\n",
	"4. frequencies_is_nonsense: 8\n"
	]
	}
	]
	}
	]
	}
No results found