Skip to content

Instantly share code, notes, and snippets.

@n7eonard
Created June 10, 2025 07:22
Show Gist options
  • Select an option

  • Save n7eonard/b8756518baba30ff215557f1080daf8a to your computer and use it in GitHub Desktop.

Select an option

Save n7eonard/b8756518baba30ff215557f1080daf8a to your computer and use it in GitHub Desktop.
smol_ai_quiz_dataset.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNjtKMS8dzijqWJJKz8Vw5q",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/n7eonard/b8756518baba30ff215557f1080daf8a/smol_ai_quiz_dataset.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1WhVkPKMgxkh",
"outputId": "5bb0f726-d20d-46a2-b9be-9a579c9d3879"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: feedparser in /usr/local/lib/python3.11/dist-packages (6.0.11)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (2.32.3)\n",
"Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.11/dist-packages (4.13.4)\n",
"Requirement already satisfied: markdownify in /usr/local/lib/python3.11/dist-packages (1.1.0)\n",
"Requirement already satisfied: sgmllib3k in /usr/local/lib/python3.11/dist-packages (from feedparser) (1.0.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests) (3.4.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests) (2.4.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests) (2025.4.26)\n",
"Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4) (2.7)\n",
"Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.11/dist-packages (from beautifulsoup4) (4.14.0)\n",
"Requirement already satisfied: six<2,>=1.15 in /usr/local/lib/python3.11/dist-packages (from markdownify) (1.17.0)\n"
]
}
],
"source": [
"# Versions are pinned to the ones recorded in this cell's last run so that a\n",
"# later re-run installs the same libraries the dataset was built with.\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install feedparser==6.0.11 requests==2.32.3 beautifulsoup4==4.13.4 markdownify==1.1.0"
]
},
{
"cell_type": "code",
"source": [
"import feedparser\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from markdownify import markdownify as md\n",
"import json\n",
"import os\n",
"from datetime import datetime\n",
"\n",
"# --- Configuration ---------------------------------------------------------\n",
"\n",
"# RSS feed that is parsed to enumerate the issues.\n",
"RSS_URL = \"https://news.smol.ai/rss.xml\"\n",
"\n",
"# Keywords used to categorize bullet points; matching is done against the\n",
"# lowercased bullet text, so keep every entry lowercase.\n",
"TAG_LIST = [\n",
"    \"chatgpt\",\n",
"    \"hardware\",\n",
"    \"benchmarks\",\n",
"    \"model releases\",\n",
"    \"bugs\",\n",
"    \"community\",\n",
"]\n",
"\n",
"# Output folder; created up front if it does not exist yet.\n",
"OUTPUT_DIR = \"smol_issues\"\n",
"os.makedirs(OUTPUT_DIR, exist_ok=True)"
],
"metadata": {
"id": "tYae5IoAg8k6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Download and parse the RSS feed; each entry corresponds to one issue.\n",
"feed = feedparser.parse(RSS_URL)\n",
"\n",
"# feedparser flags download/parse problems through `bozo` instead of raising,\n",
"# so without this check an unreachable feed would just look like zero issues.\n",
"if feed.bozo and not feed.entries:\n",
"    raise RuntimeError(f\"Could not read {RSS_URL}: {feed.get('bozo_exception')}\")\n",
"\n",
"print(f\"✅ Found {len(feed.entries)} issues.\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "amiTnqP9iXj7",
"outputId": "539215df-d1ef-4ff1-e983-bd2fa5046104"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ Found 398 issues.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Seconds to wait for each issue page; without a timeout a single stalled\n",
"# server would hang the whole crawl.\n",
"REQUEST_TIMEOUT = 30\n",
"\n",
"# A list item is a marker followed by a space. markdownify's default bullet\n",
"# markers are *, + and - (chosen by nesting depth), so all three are accepted.\n",
"BULLET_PREFIXES = (\"- \", \"* \", \"+ \")\n",
"\n",
"\n",
"def extract_bullets(markdown_text):\n",
"    \"\"\"Return the text of every markdown list item, without its bullet marker.\n",
"\n",
"    Lines that merely start with a marker character (horizontal rules such as\n",
"    ---, or paragraphs opening with **bold** text) are not list items and are\n",
"    ignored, as are list items with no text.\n",
"    \"\"\"\n",
"    bullets = []\n",
"    for raw_line in markdown_text.splitlines():\n",
"        stripped = raw_line.strip()\n",
"        if not stripped.startswith(BULLET_PREFIXES):\n",
"            continue\n",
"        # Drop exactly the two-character marker. Stripping every leading\n",
"        # '-', '*' or ' ' would also eat the ** of a bullet that opens in bold.\n",
"        text = stripped[2:].strip()\n",
"        if text:\n",
"            bullets.append(text)\n",
"    return bullets\n",
"\n",
"\n",
"def categorize(bullets):\n",
"    \"\"\"Group bullets under every TAG_LIST keyword they contain.\n",
"\n",
"    Matching is a case-insensitive substring test, so one bullet can land in\n",
"    several categories. Bullets matching no keyword go to the 'others' list.\n",
"    \"\"\"\n",
"    categorized = {tag: [] for tag in TAG_LIST}\n",
"    categorized[\"others\"] = []\n",
"    for text in bullets:\n",
"        lower = text.lower()\n",
"        matched_tags = [tag for tag in TAG_LIST if tag in lower]\n",
"        for tag in matched_tags or [\"others\"]:\n",
"            categorized[tag].append(text)\n",
"    return categorized\n",
"\n",
"\n",
"issues_quiz_data = []\n",
"skipped_links = []  # issues that could not be fetched or had no article body\n",
"\n",
"# One session reuses the HTTP connection across the per-issue requests.\n",
"with requests.Session() as session:\n",
"    for entry in feed.entries:\n",
"        link = entry.link\n",
"\n",
"        # Fetch the full issue page. A single unreachable or failing page is\n",
"        # recorded and skipped instead of aborting the whole crawl.\n",
"        try:\n",
"            resp = session.get(link, timeout=REQUEST_TIMEOUT)\n",
"            resp.raise_for_status()  # do not parse 4xx/5xx error pages as content\n",
"        except requests.RequestException as exc:\n",
"            print(f\"⚠️ Skipping {link}: {exc}\")\n",
"            skipped_links.append(link)\n",
"            continue\n",
"\n",
"        soup = BeautifulSoup(resp.content, \"html.parser\")\n",
"        article = soup.find(\"article\") or soup.find(\"main\")\n",
"        if not article:\n",
"            skipped_links.append(link)\n",
"            continue\n",
"\n",
"        # Convert to markdown, extract bullet points, then bucket them by tag.\n",
"        categorized = categorize(extract_bullets(md(str(article))))\n",
"\n",
"        issues_quiz_data.append({\n",
"            \"title\": entry.title,\n",
"            \"date\": entry.published,\n",
"            \"link\": link,\n",
"            \"categories\": categorized,\n",
"            \"tags\": [tag for tag, bullets in categorized.items() if bullets and tag != \"others\"],\n",
"        })\n",
"\n",
"if skipped_links:\n",
"    print(f\"⚠️ Skipped {len(skipped_links)} issues (see skipped_links).\")\n",
"\n",
"print(f\"✅ Parsed and categorized {len(issues_quiz_data)} issues.\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "duo79gN1iZm6",
"outputId": "57c11524-d428-4697-da85-ec7bcb448c31"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ Parsed and categorized 398 issues.\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import json\n",
"import os\n",
"\n",
"# Reuse the configured OUTPUT_DIR instead of repeating the folder name, so the\n",
"# export location cannot drift from the configuration cell.\n",
"output_dir = OUTPUT_DIR\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"# Full path of the JSON dataset file written by the next cell.\n",
"output_path = os.path.join(output_dir, \"smol_issues_quiz.json\")"
],
"metadata": {
"id": "Q5bbYRpZib6Q"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Write the collected issues as pretty-printed JSON. ensure_ascii=False keeps\n",
"# non-ASCII characters readable instead of emitting \\\\uXXXX escapes, which is\n",
"# why the file is opened explicitly as UTF-8.\n",
"with open(output_path, mode=\"w\", encoding=\"utf-8\") as out_file:\n",
"    json.dump(\n",
"        issues_quiz_data,\n",
"        out_file,\n",
"        ensure_ascii=False,\n",
"        indent=2,\n",
"    )\n",
"\n",
"print(f\"✅ JSON saved to: {output_path}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WHRl6Qttirft",
"outputId": "a24b1253-93fd-4aa1-e81b-cd883bc7e8ee"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"✅ JSON saved to: smol_issues/smol_issues_quiz.json\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Offer the dataset as a browser download when running in Google Colab.\n",
"# google.colab is not importable elsewhere, so outside Colab fall back to\n",
"# reporting where the file is instead of failing the run.\n",
"try:\n",
"    from google.colab import files\n",
"except ImportError:\n",
"    print(f\"Not running in Colab; the file is available at: {output_path}\")\n",
"else:\n",
"    files.download(output_path)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"id": "FE7NeKYarvmO",
"outputId": "e835592f-ab7a-48e9-fb94-0ce033059f45"
},
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.Javascript object>"
],
"application/javascript": [
"\n",
" async function download(id, filename, size) {\n",
" if (!google.colab.kernel.accessAllowed) {\n",
" return;\n",
" }\n",
" const div = document.createElement('div');\n",
" const label = document.createElement('label');\n",
" label.textContent = `Downloading \"${filename}\": `;\n",
" div.appendChild(label);\n",
" const progress = document.createElement('progress');\n",
" progress.max = size;\n",
" div.appendChild(progress);\n",
" document.body.appendChild(div);\n",
"\n",
" const buffers = [];\n",
" let downloaded = 0;\n",
"\n",
" const channel = await google.colab.kernel.comms.open(id);\n",
" // Send a message to notify the kernel that we're ready.\n",
" channel.send({})\n",
"\n",
" for await (const message of channel.messages) {\n",
" // Send a message to notify the kernel that we're ready.\n",
" channel.send({})\n",
" if (message.buffers) {\n",
" for (const buffer of message.buffers) {\n",
" buffers.push(buffer);\n",
" downloaded += buffer.byteLength;\n",
" progress.value = downloaded;\n",
" }\n",
" }\n",
" }\n",
" const blob = new Blob(buffers, {type: 'application/binary'});\n",
" const a = document.createElement('a');\n",
" a.href = window.URL.createObjectURL(blob);\n",
" a.download = filename;\n",
" div.appendChild(a);\n",
" a.click();\n",
" div.remove();\n",
" }\n",
" "
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.Javascript object>"
],
"application/javascript": [
"download(\"download_4dc218db-d403-4c3b-8d77-0176a62bde3e\", \"smol_issues_quiz.json\", 66194963)"
]
},
"metadata": {}
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment