Last active
December 26, 2021 09:56
-
-
Save vol1ura/722bbe3cd9dadfcaac9e778cd2055a70 to your computer and use it in GitHub Desktop.
clubmates.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "clubmates.ipynb", | |
| "provenance": [], | |
| "collapsed_sections": [], | |
| "authorship_tag": "ABX9TyMSbfzztz2IMxYL6dDXQJeG", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/vol1ura/722bbe3cd9dadfcaac9e778cd2055a70/clubmates.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "zRg0J0r6ztxV" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import random\n", | |
| "import re\n", | |
| "import requests\n", | |
| "import time\n", | |
| "from tqdm.notebook import tqdm\n", | |
| "\n", | |
| "pd.set_option('display.max_rows', None)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "!pip install random_user_agent" | |
| ], | |
| "metadata": { | |
| "id": "Z61g9j-HFebY", | |
| "outputId": "3c9ecf80-d679-4e10-b42c-67e33d268940", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| } | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Requirement already satisfied: random_user_agent in /usr/local/lib/python3.7/dist-packages (1.0.1)\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "from random_user_agent.user_agent import UserAgent\n", | |
| "\n", | |
| "user_agent_rotator = UserAgent()" | |
| ], | |
| "metadata": { | |
| "id": "bZnw6iEPFy36" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "club_id = 23212 # Wake&Run\n", | |
| "home_parkrun = 'Kuzminki'" | |
| ], | |
| "metadata": { | |
| "id": "clo0v2xzzyLU" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "headers = {\n", | |
| " 'Host': 'www.parkrun.ru',\n", | |
| " 'User-Agent': user_agent_rotator.get_random_user_agent(),\n", | |
| " 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',\n", | |
| " 'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3',\n", | |
| " 'Accept-Encoding': 'gzip, deflate, br',\n", | |
| " 'Connection': 'keep-alive',\n", | |
| " 'Upgrade-Insecure-Requests': '1',\n", | |
| " 'Sec-GPC': '1',\n", | |
| " 'TE': 'Trailers'\n", | |
| " }" | |
| ], | |
| "metadata": { | |
| "id": "JMntqd19z4bw" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "page_all_results = requests.get('https://www.parkrun.ru/results/courserecords/', headers=headers)\n", | |
| "data = pd.read_html(page_all_results.text)[0]\n", | |
| "russian_parkruns = data[data.columns[0]]" | |
| ], | |
| "metadata": { | |
| "id": "ctIMuJuVz7T9", | |
| "outputId": "29d2ffd0-250d-40e1-a1f9-a2edb2e98511", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 348 | |
| } | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "error", | |
| "ename": "ValueError", | |
| "evalue": "ignored", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[0;32m<ipython-input-58-7fb0c7ed9957>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mpage_all_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'https://www.parkrun.ru/results/courserecords/'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_html\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpage_all_results\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mrussian_parkruns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/util/_decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 294\u001b[0m )\n\u001b[1;32m 295\u001b[0m \u001b[0mwarnings\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwarn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mFutureWarning\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstacklevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstacklevel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 296\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 297\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36mread_html\u001b[0;34m(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)\u001b[0m\n\u001b[1;32m 1099\u001b[0m \u001b[0mna_values\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mna_values\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1100\u001b[0m \u001b[0mkeep_default_na\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkeep_default_na\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1101\u001b[0;31m \u001b[0mdisplayed_only\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdisplayed_only\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1102\u001b[0m )\n", | |
| "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36m_parse\u001b[0;34m(flavor, io, match, attrs, encoding, displayed_only, **kwargs)\u001b[0m\n\u001b[1;32m 915\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 916\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 917\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mretained\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 918\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 919\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36m_parse\u001b[0;34m(flavor, io, match, attrs, encoding, displayed_only, **kwargs)\u001b[0m\n\u001b[1;32m 896\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 897\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 898\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 899\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcaught\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 900\u001b[0m \u001b[0;31m# if `io` is an io-like object, check if it's seekable\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36mparse_tables\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0mlist\u001b[0m \u001b[0mof\u001b[0m \u001b[0mparsed\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mheader\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfooter\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0mtuples\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtables\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 216\u001b[0m \"\"\"\n\u001b[0;32m--> 217\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_build_doc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 218\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse_thead_tbody_tfoot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtable\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtable\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtables\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 219\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/usr/local/lib/python3.7/dist-packages/pandas/io/html.py\u001b[0m in \u001b[0;36m_parse_tables\u001b[0;34m(self, doc, match, attrs)\u001b[0m\n\u001b[1;32m 545\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 546\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtables\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 547\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No tables found\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 548\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;31mValueError\u001b[0m: No tables found" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "dfs = []\n", | |
| "for parkrun in tqdm(russian_parkruns):\n", | |
| " time.sleep(3 + 5*random.random())\n", | |
| " parkrun_trim = re.sub(r'[\\s-]', '', parkrun)\n", | |
| " url = f'https://www.parkrun.ru/{parkrun_trim}/results/clubhistory/?clubNum={club_id}'\n", | |
| " headers['User-Agent'] = user_agent_rotator.get_random_user_agent()\n", | |
| " club_results = requests.get(url, headers=headers)\n", | |
| " try:\n", | |
| " df = pd.read_html(club_results.text)[0]\n", | |
| " dfs.append(df[df.columns[0]])\n", | |
| " if parkrun == home_parkrun:\n", | |
| " home_list = df[df.columns[0]]\n", | |
| " except:\n", | |
| " print('ОШИБКА - операция завершилась досрочно. Паркран временно заблокировал IP.')\n", | |
| " break" | |
| ], | |
| "metadata": { | |
| "id": "pB86PSu40JiI" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "def last_name_first(full_name: str):\n", | |
| " names = full_name.split()\n", | |
| " last_name = names.pop(1).capitalize()\n", | |
| " names.insert(0, last_name)\n", | |
| " return ' '.join(names)" | |
| ], | |
| "metadata": { | |
| "id": "JGv4TY_5LOho" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "full_list = pd.concat(dfs).drop_duplicates(keep='last')\n", | |
| "outer_home_list = pd.concat([full_list, home_list]).apply(last_name_first).sort_values().drop_duplicates(keep=False).reset_index(drop=True)" | |
| ], | |
| "metadata": { | |
| "id": "u-ij29bnAOaq" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### Одноклубники, не бегавшие на домашнем паркране:" | |
| ], | |
| "metadata": { | |
| "id": "QmpqdvUI8Hsx" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "outer_home_list.shift(1, fill_value='________Фамилия_Имя___')" | |
| ], | |
| "metadata": { | |
| "id": "526myT8CFL74" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "### Полный список одноклубников:" | |
| ], | |
| "metadata": { | |
| "id": "5Ka2TdEp8a4_" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "full_list.apply(last_name_first).sort_values().reset_index(drop=True).shift(1, fill_value='________Фамилия_Имя___')" | |
| ], | |
| "metadata": { | |
| "id": "ImhNvi-aS6I_" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment