Created
June 10, 2025 07:22
-
-
Save n7eonard/b8756518baba30ff215557f1080daf8a to your computer and use it in GitHub Desktop.
smol_ai_quiz_dataset.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "authorship_tag": "ABX9TyNjtKMS8dzijqWJJKz8Vw5q", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/n7eonard/b8756518baba30ff215557f1080daf8a/smol_ai_quiz_dataset.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "1WhVkPKMgxkh", | |
| "outputId": "5bb0f726-d20d-46a2-b9be-9a579c9d3879" | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Requirement already satisfied: feedparser in /usr/local/lib/python3.11/dist-packages (6.0.11)\n", | |
| "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (2.32.3)\n", | |
| "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.11/dist-packages (4.13.4)\n", | |
| "Requirement already satisfied: markdownify in /usr/local/lib/python3.11/dist-packages (1.1.0)\n", | |
| "Requirement already satisfied: sgmllib3k in /usr/local/lib/python3.11/dist-packages (from feedparser) (1.0.0)\n", | |
| "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests) (3.4.2)\n", | |
| "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests) (3.10)\n", | |
| "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests) (2.4.0)\n", | |
| "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests) (2025.4.26)\n", | |
| "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4) (2.7)\n", | |
| "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4) (4.14.0)\n", | |
| "Requirement already satisfied: six<2,>=1.15 in /usr/local/lib/python3.11/dist-packages (from markdownify) (1.17.0)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!pip install feedparser requests beautifulsoup4 markdownify" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import feedparser\n", | |
| "import requests\n", | |
| "from bs4 import BeautifulSoup\n", | |
| "from markdownify import markdownify as md\n", | |
| "import json\n", | |
| "import os\n", | |
| "from datetime import datetime\n", | |
| "\n", | |
| "# Configuration\n", | |
| "RSS_URL = \"https://news.smol.ai/rss.xml\"\n", | |
| "TAG_LIST = [\"chatgpt\", \"hardware\", \"benchmarks\", \"model releases\", \"bugs\", \"community\"]\n", | |
| "OUTPUT_DIR = \"smol_issues\"\n", | |
| "os.makedirs(OUTPUT_DIR, exist_ok=True)" | |
| ], | |
| "metadata": { | |
| "id": "tYae5IoAg8k6" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "feed = feedparser.parse(RSS_URL)\n", | |
| "print(f\"✅ Found {len(feed.entries)} issues.\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "amiTnqP9iXj7", | |
| "outputId": "539215df-d1ef-4ff1-e983-bd2fa5046104" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "✅ Found 398 issues.\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "issues_quiz_data = []\n", | |
| "\n", | |
| "for entry in feed.entries:\n", | |
| " title = entry.title\n", | |
| " date = entry.published\n", | |
| " link = entry.link\n", | |
| "\n", | |
| " # Fetch and parse full issue content\n", | |
| " resp = requests.get(link)\n", | |
| " soup = BeautifulSoup(resp.content, \"html.parser\")\n", | |
| " article = soup.find(\"article\") or soup.find(\"main\")\n", | |
| "\n", | |
| " if not article:\n", | |
| " continue\n", | |
| "\n", | |
| " # Convert to markdown, extract bullet points\n", | |
| " md_text = md(str(article))\n", | |
| " lines = [l.strip() for l in md_text.splitlines() if l.strip().startswith((\"-\", \"*\"))]\n", | |
| "\n", | |
| " categorized = {tag: [] for tag in TAG_LIST}\n", | |
| " categorized[\"others\"] = []\n", | |
| "\n", | |
| " for line in lines:\n", | |
| " text = line.lstrip(\"-* \").strip()\n", | |
| " lower = text.lower()\n", | |
| " matched = False\n", | |
| " for tag in TAG_LIST:\n", | |
| " if tag in lower:\n", | |
| " categorized[tag].append(text)\n", | |
| " matched = True\n", | |
| " if not matched:\n", | |
| " categorized[\"others\"].append(text)\n", | |
| "\n", | |
| " quiz_entry = {\n", | |
| " \"title\": title,\n", | |
| " \"date\": date,\n", | |
| " \"link\": link,\n", | |
| " \"categories\": categorized,\n", | |
| " \"tags\": [tag for tag, bullets in categorized.items() if bullets and tag != \"others\"],\n", | |
| " }\n", | |
| "\n", | |
| " issues_quiz_data.append(quiz_entry)\n", | |
| "\n", | |
| "print(f\"✅ Parsed and categorized {len(issues_quiz_data)} issues.\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "duo79gN1iZm6", | |
| "outputId": "57c11524-d428-4697-da85-ec7bcb448c31" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "✅ Parsed and categorized 398 issues.\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import json\n", | |
| "import os\n", | |
| "\n", | |
| "# Define output directory and file name\n", | |
| "output_dir = \"smol_issues\"\n", | |
| "os.makedirs(output_dir, exist_ok=True)\n", | |
| "\n", | |
| "output_path = os.path.join(output_dir, \"smol_issues_quiz.json\")" | |
| ], | |
| "metadata": { | |
| "id": "Q5bbYRpZib6Q" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "with open(output_path, \"w\", encoding=\"utf-8\") as f:\n", | |
| " json.dump(issues_quiz_data, f, indent=2, ensure_ascii=False)\n", | |
| "\n", | |
| "print(f\"✅ JSON saved to: {output_path}\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "WHRl6Qttirft", | |
| "outputId": "a24b1253-93fd-4aa1-e81b-cd883bc7e8ee" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "✅ JSON saved to: smol_issues/smol_issues_quiz.json\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "from google.colab import files\n", | |
| "files.download(output_path)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 17 | |
| }, | |
| "id": "FE7NeKYarvmO", | |
| "outputId": "e835592f-ab7a-48e9-fb94-0ce033059f45" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<IPython.core.display.Javascript object>" | |
| ], | |
| "application/javascript": [ | |
| "\n", | |
| " async function download(id, filename, size) {\n", | |
| " if (!google.colab.kernel.accessAllowed) {\n", | |
| " return;\n", | |
| " }\n", | |
| " const div = document.createElement('div');\n", | |
| " const label = document.createElement('label');\n", | |
| " label.textContent = `Downloading \"${filename}\": `;\n", | |
| " div.appendChild(label);\n", | |
| " const progress = document.createElement('progress');\n", | |
| " progress.max = size;\n", | |
| " div.appendChild(progress);\n", | |
| " document.body.appendChild(div);\n", | |
| "\n", | |
| " const buffers = [];\n", | |
| " let downloaded = 0;\n", | |
| "\n", | |
| " const channel = await google.colab.kernel.comms.open(id);\n", | |
| " // Send a message to notify the kernel that we're ready.\n", | |
| " channel.send({})\n", | |
| "\n", | |
| " for await (const message of channel.messages) {\n", | |
| " // Send a message to notify the kernel that we're ready.\n", | |
| " channel.send({})\n", | |
| " if (message.buffers) {\n", | |
| " for (const buffer of message.buffers) {\n", | |
| " buffers.push(buffer);\n", | |
| " downloaded += buffer.byteLength;\n", | |
| " progress.value = downloaded;\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " const blob = new Blob(buffers, {type: 'application/binary'});\n", | |
| " const a = document.createElement('a');\n", | |
| " a.href = window.URL.createObjectURL(blob);\n", | |
| " a.download = filename;\n", | |
| " div.appendChild(a);\n", | |
| " a.click();\n", | |
| " div.remove();\n", | |
| " }\n", | |
| " " | |
| ] | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<IPython.core.display.Javascript object>" | |
| ], | |
| "application/javascript": [ | |
| "download(\"download_4dc218db-d403-4c3b-8d77-0176a62bde3e\", \"smol_issues_quiz.json\", 66194963)" | |
| ] | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment