Created
April 3, 2016 00:19
-
-
Save Amareis/b0a723e698783929c5b1f6c81d412f85 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 74, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "%matplotlib notebook\n", | |
| "\n", | |
| "import matplotlib\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "import pandas as pd, numpy as np, requests, json\n", | |
| "\n", | |
| "from tqdm import tqdm\n", | |
| "from pandas.io.json import json_normalize\n", | |
| "\n", | |
| "# Switch every figure drawn below to the ggplot theme.\n", | |
| "plt.style.use('ggplot')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from html.parser import HTMLParser\n", | |
| "\n", | |
| "class MLStripper(HTMLParser):\n", | |
| "    \"\"\"HTML parser that drops the markup and keeps the text between tags.\n", | |
| "\n", | |
| "    Feed markup with feed(), call close() to flush anything still buffered,\n", | |
| "    then read the accumulated text with get_data().\n", | |
| "    \"\"\"\n", | |
| "\n", | |
| "    def __init__(self):\n", | |
| "        # Let HTMLParser initialise its own state rather than calling reset()\n", | |
| "        # by hand. convert_charrefs=True hands character references such as\n", | |
| "        # &amp; to handle_data() already decoded.\n", | |
| "        super().__init__(convert_charrefs=True)\n", | |
| "        self.fed = []  # text fragments in document order\n", | |
| "\n", | |
| "    def handle_data(self, d):\n", | |
| "        \"\"\"Record one run of text reported by the parser.\"\"\"\n", | |
| "        self.fed.append(d)\n", | |
| "\n", | |
| "    def get_data(self):\n", | |
| "        \"\"\"Return every recorded fragment joined into one string.\"\"\"\n", | |
| "        return ''.join(self.fed)\n", | |
| "\n", | |
| "def strip_tags(html):\n", | |
| "    \"\"\"Return the text of an HTML fragment with its tags removed.\n", | |
| "\n", | |
| "    Line-break tags (<br>, <br/>, <br />, any letter case) become newlines;\n", | |
| "    all other markup is discarded by MLStripper.\n", | |
| "    \"\"\"\n", | |
| "    import re  # local import: this cell itself only imports HTMLParser\n", | |
| "\n", | |
| "    html = re.sub(r'<br\\s*/?>', '\\n', html, flags=re.IGNORECASE)\n", | |
| "    s = MLStripper()\n", | |
| "    s.feed(html)\n", | |
| "    # close() flushes text the parser is still holding back. Without it a\n", | |
| "    # trailing fragment with a bare '&' (for example 'AT&T') is lost.\n", | |
| "    s.close()\n", | |
| "    return s.get_data()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 42, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "('https://2ch.hk/b/catalog.json', 200)" | |
| ] | |
| }, | |
| "execution_count": 42, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "board = 'b'\n", | |
| "url = 'https://2ch.hk/{}/catalog.json'.format(board)\n", | |
| "# requests never times out on its own; bound the wait (seconds) so a\n", | |
| "# stalled connection cannot hang the kernel.\n", | |
| "resp = requests.get(url, timeout=30)\n", | |
| "url, resp.status_code" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# The catalog payload keeps its per-thread records under the 'threads' key.\n", | |
| "catalog = json.loads(resp.text)\n", | |
| "ops_json = catalog['threads']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 44, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>comment</th>\n", | |
| " <th>date</th>\n", | |
| " <th>files_count</th>\n", | |
| " <th>num</th>\n", | |
| " <th>posts_count</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td></td>\n", | |
| " <td>2016-04-02 11:37:27</td>\n", | |
| " <td>0</td>\n", | |
| " <td>122120582</td>\n", | |
| " <td>63</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td></td>\n", | |
| " <td>2016-04-02 16:45:30</td>\n", | |
| " <td>62</td>\n", | |
| " <td>122150646</td>\n", | |
| " <td>189</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>Камвхоры и неймфаги двачей/тиречей/сосачей/хар...</td>\n", | |
| " <td>2016-04-02 18:58:13</td>\n", | |
| " <td>67</td>\n", | |
| " <td>122164310</td>\n", | |
| " <td>242</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td></td>\n", | |
| " <td>2016-04-02 20:15:14</td>\n", | |
| " <td>1</td>\n", | |
| " <td>122171916</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>ЦУЬИ WEBM ВЕЧЕРНИЙ/НОЧНОЙ</td>\n", | |
| " <td>2016-04-02 19:20:06</td>\n", | |
| " <td>291</td>\n", | |
| " <td>122166381</td>\n", | |
| " <td>470</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " comment date \\\n", | |
| "0 2016-04-02 11:37:27 \n", | |
| "1 2016-04-02 16:45:30 \n", | |
| "2 Камвхоры и неймфаги двачей/тиречей/сосачей/хар... 2016-04-02 18:58:13 \n", | |
| "3 2016-04-02 20:15:14 \n", | |
| "4 ЦУЬИ WEBM ВЕЧЕРНИЙ/НОЧНОЙ 2016-04-02 19:20:06 \n", | |
| "\n", | |
| " files_count num posts_count \n", | |
| "0 0 122120582 63 \n", | |
| "1 62 122150646 189 \n", | |
| "2 67 122164310 242 \n", | |
| "3 1 122171916 3 \n", | |
| "4 291 122166381 470 " | |
| ] | |
| }, | |
| "execution_count": 44, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Columns of the catalog records that are removed from the frame below.\n", | |
| "op_droplist = ['subject', 'tags', 'email', 'banned', 'closed', 'op', 'parent',\n", | |
| "               'sticky', 'hidden_num', 'trip', 'name', 'files', 'lasthit', 'timestamp']\n", | |
| "ops = json_normalize(ops_json)\n", | |
| "# 'timestamp' is read as epoch seconds and kept as the datetime column 'date'.\n", | |
| "ops['date'] = pd.to_datetime(ops['timestamp'], unit='s')\n", | |
| "ops['comment'] = ops['comment'].apply(strip_tags)\n", | |
| "# Name the axis explicitly and rebind instead of mutating in place: the\n", | |
| "# positional axis and a non-bool inplace flag are rejected by newer pandas.\n", | |
| "ops = ops.drop(op_droplist, axis=1)\n", | |
| "ops.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>duration</th>\n", | |
| " <th>height</th>\n", | |
| " <th>name</th>\n", | |
| " <th>size</th>\n", | |
| " <th>tn_height</th>\n", | |
| " <th>tn_width</th>\n", | |
| " <th>type</th>\n", | |
| " <th>width</th>\n", | |
| " <th>num</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td></td>\n", | |
| " <td>640</td>\n", | |
| " <td>14595970473540.jpg</td>\n", | |
| " <td>60</td>\n", | |
| " <td>220</td>\n", | |
| " <td>165</td>\n", | |
| " <td>1</td>\n", | |
| " <td>480</td>\n", | |
| " <td>122120582</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>00:00:32</td>\n", | |
| " <td>1080</td>\n", | |
| " <td>14596155301500.webm</td>\n", | |
| " <td>3298</td>\n", | |
| " <td>123</td>\n", | |
| " <td>220</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1920</td>\n", | |
| " <td>122150646</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td></td>\n", | |
| " <td>2592</td>\n", | |
| " <td>14596234938300.jpg</td>\n", | |
| " <td>765</td>\n", | |
| " <td>220</td>\n", | |
| " <td>164</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1936</td>\n", | |
| " <td>122164310</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td></td>\n", | |
| " <td>336</td>\n", | |
| " <td>14596281143980.jpg</td>\n", | |
| " <td>26</td>\n", | |
| " <td>120</td>\n", | |
| " <td>220</td>\n", | |
| " <td>1</td>\n", | |
| " <td>615</td>\n", | |
| " <td>122171916</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>00:01:27</td>\n", | |
| " <td>360</td>\n", | |
| " <td>14596248064220.webm</td>\n", | |
| " <td>5525</td>\n", | |
| " <td>165</td>\n", | |
| " <td>220</td>\n", | |
| " <td>6</td>\n", | |
| " <td>480</td>\n", | |
| " <td>122166381</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " duration height name size tn_height tn_width type \\\n", | |
| "0 640 14595970473540.jpg 60 220 165 1 \n", | |
| "1 00:00:32 1080 14596155301500.webm 3298 123 220 6 \n", | |
| "2 2592 14596234938300.jpg 765 220 164 1 \n", | |
| "3 336 14596281143980.jpg 26 120 220 1 \n", | |
| "4 00:01:27 360 14596248064220.webm 5525 165 220 6 \n", | |
| "\n", | |
| " width num \n", | |
| "0 480 122120582 \n", | |
| "1 1920 122150646 \n", | |
| "2 1936 122164310 \n", | |
| "3 615 122171916 \n", | |
| "4 480 122166381 " | |
| ] | |
| }, | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# One row per entry of each record's 'files' list, with the record's 'num'\n", | |
| "# carried along. axis=1 is spelled out because newer pandas no longer takes\n", | |
| "# the axis positionally.\n", | |
| "opfiles = json_normalize(ops_json, 'files', ['num']).drop(['md5', 'nsfw', 'path', 'thumbnail'], axis=1)\n", | |
| "opfiles.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 47, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'17869 posts loaded from 147 threads!'" | |
| ] | |
| }, | |
| "execution_count": 47, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "posts_json = []\n", | |
| "exist_ops = 0\n", | |
| "# .iloc[::-1] walks the thread numbers from the last catalog row to the first\n", | |
| "# by position, whatever the frame's index looks like.\n", | |
| "for n in tqdm(ops['num'].iloc[::-1], mininterval=0.1, total=len(ops), leave=True):\n", | |
| "    turl = 'https://2ch.hk/{}/res/{}.json'.format(board, n)\n", | |
| "    # Use a separate name so the catalog response held in `resp` is not\n", | |
| "    # overwritten, and bound the wait (seconds) so one stalled request\n", | |
| "    # cannot hang the whole loop.\n", | |
| "    thread_resp = requests.get(turl, timeout=30)\n", | |
| "    # Threads that do not answer with 200 are skipped and not counted.\n", | |
| "    if thread_resp.status_code == 200:\n", | |
| "        ps = json.loads(thread_resp.text)['threads'][0]['posts']\n", | |
| "        posts_json.extend(ps)\n", | |
| "        exist_ops += 1\n", | |
| "'{} posts loaded from {} threads'.format(len(posts_json), exist_ops)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 61, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>comment</th>\n", | |
| " <th>date</th>\n", | |
| " <th>num</th>\n", | |
| " <th>number</th>\n", | |
| " <th>parent</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>http://slither.io</td>\n", | |
| " <td>2016-04-02 16:06:27</td>\n", | |
| " <td>122146512</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td></td>\n", | |
| " <td>2016-04-02 16:25:44</td>\n", | |
| " <td>122148604</td>\n", | |
| " <td>2</td>\n", | |
| " <td>122146512</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td></td>\n", | |
| " <td>2016-04-02 16:33:31</td>\n", | |
| " <td>122149359</td>\n", | |
| " <td>3</td>\n", | |
| " <td>122146512</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>>>122149359</td>\n", | |
| " <td>2016-04-02 16:34:54</td>\n", | |
| " <td>122149503</td>\n", | |
| " <td>4</td>\n", | |
| " <td>122146512</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td></td>\n", | |
| " <td>2016-04-02 16:35:33</td>\n", | |
| " <td>122149579</td>\n", | |
| " <td>5</td>\n", | |
| " <td>122146512</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " comment date num number parent\n", | |
| "0 http://slither.io 2016-04-02 16:06:27 122146512 1 0\n", | |
| "1 2016-04-02 16:25:44 122148604 2 122146512\n", | |
| "2 2016-04-02 16:33:31 122149359 3 122146512\n", | |
| "3 >>122149359 2016-04-02 16:34:54 122149503 4 122146512\n", | |
| "4 2016-04-02 16:35:33 122149579 5 122146512" | |
| ] | |
| }, | |
| "execution_count": 61, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Columns of the post records that are removed from the frame below.\n", | |
| "post_droplist = ['tags', 'subject', 'email', 'banned', 'closed', 'lasthit', 'op',\n", | |
| "                 'sticky', 'hidden_num', 'trip', 'name', 'files', 'timestamp']\n", | |
| "posts = json_normalize(posts_json)\n", | |
| "# 'timestamp' is read as epoch seconds and kept as the datetime column 'date'.\n", | |
| "posts['date'] = pd.to_datetime(posts['timestamp'], unit='s')\n", | |
| "posts['comment'] = posts['comment'].apply(strip_tags)\n", | |
| "# Name the axis explicitly and rebind instead of mutating in place: the\n", | |
| "# positional axis and a non-bool inplace flag are rejected by newer pandas.\n", | |
| "posts = posts.drop(post_droplist, axis=1)\n", | |
| "posts.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>duration</th>\n", | |
| " <th>height</th>\n", | |
| " <th>name</th>\n", | |
| " <th>size</th>\n", | |
| " <th>tn_height</th>\n", | |
| " <th>tn_width</th>\n", | |
| " <th>type</th>\n", | |
| " <th>width</th>\n", | |
| " <th>num</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td></td>\n", | |
| " <td>661</td>\n", | |
| " <td>14596131871290.png</td>\n", | |
| " <td>1372</td>\n", | |
| " <td>111</td>\n", | |
| " <td>220</td>\n", | |
| " <td>2</td>\n", | |
| " <td>1302</td>\n", | |
| " <td>122146512</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td></td>\n", | |
| " <td>589</td>\n", | |
| " <td>14596196055160.jpg</td>\n", | |
| " <td>87</td>\n", | |
| " <td>125</td>\n", | |
| " <td>170</td>\n", | |
| " <td>1</td>\n", | |
| " <td>800</td>\n", | |
| " <td>122157798</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td></td>\n", | |
| " <td>147</td>\n", | |
| " <td>14596251721600.jpg</td>\n", | |
| " <td>9</td>\n", | |
| " <td>147</td>\n", | |
| " <td>220</td>\n", | |
| " <td>1</td>\n", | |
| " <td>220</td>\n", | |
| " <td>122166974</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td></td>\n", | |
| " <td>726</td>\n", | |
| " <td>14595554375820.png</td>\n", | |
| " <td>204</td>\n", | |
| " <td>220</td>\n", | |
| " <td>93</td>\n", | |
| " <td>2</td>\n", | |
| " <td>308</td>\n", | |
| " <td>122084428</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td></td>\n", | |
| " <td>1068</td>\n", | |
| " <td>14595555467730.jpg</td>\n", | |
| " <td>257</td>\n", | |
| " <td>113</td>\n", | |
| " <td>170</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1600</td>\n", | |
| " <td>122084533</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " duration height name size tn_height tn_width type \\\n", | |
| "0 661 14596131871290.png 1372 111 220 2 \n", | |
| "1 589 14596196055160.jpg 87 125 170 1 \n", | |
| "2 147 14596251721600.jpg 9 147 220 1 \n", | |
| "3 726 14595554375820.png 204 220 93 2 \n", | |
| "4 1068 14595555467730.jpg 257 113 170 1 \n", | |
| "\n", | |
| " width num \n", | |
| "0 1302 122146512 \n", | |
| "1 800 122157798 \n", | |
| "2 220 122166974 \n", | |
| "3 308 122084428 \n", | |
| "4 1600 122084533 " | |
| ] | |
| }, | |
| "execution_count": 49, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# One row per entry of each post's 'files' list, with the post's 'num'\n", | |
| "# carried along. axis=1 is spelled out because newer pandas no longer takes\n", | |
| "# the axis positionally.\n", | |
| "postfiles = json_normalize(posts_json, 'files', ['num']).drop(['md5', 'nsfw', 'path', 'thumbnail'], axis=1)\n", | |
| "postfiles.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 57, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(Timestamp('2016-04-02 20:21:21'), 17869, 6776)" | |
| ] | |
| }, | |
| "execution_count": 57, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Date of the last row (looked up by position, not by index label),\n", | |
| "# then the number of posts and of attached files.\n", | |
| "posts['date'].iloc[-1], len(posts), len(postfiles)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 317, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import re\n", | |
| "\n", | |
| "# Words shorter than this many characters are discarded.\n", | |
| "MIN_WORD_LEN = 5\n", | |
| "# Separators, tried in this order: a URL through the end of its line, a run\n", | |
| "# of digits, the literal '(OP)' marker, or any single non-word character.\n", | |
| "# \\W is used (not a class that lists '+') so plus signs separate words too.\n", | |
| "token_sep = re.compile(r'https?://.+|\\d+|\\(OP\\)|\\W')\n", | |
| "\n", | |
| "def tokenize(comment):\n", | |
| "    \"\"\"Return the lower-cased words of `comment` with at least MIN_WORD_LEN characters.\"\"\"\n", | |
| "    pieces = (piece.lower() for piece in token_sep.split(comment))\n", | |
| "    # Adjacent separators yield empty pieces; the length test removes them.\n", | |
| "    return [word for word in pieces if len(word) >= MIN_WORD_LEN]\n", | |
| "\n", | |
| "# One flat Series holding every kept word from every non-missing comment.\n", | |
| "spl = pd.Series([word for comment in posts.comment.dropna() for word in tokenize(comment)])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 326, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>comment</th>\n", | |
| " <th>date</th>\n", | |
| " <th>num</th>\n", | |
| " <th>number</th>\n", | |
| " <th>parent</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>517</th>\n", | |
| " <td>Буду отписывать в 30 и 00 минут каждого часа (...</td>\n", | |
| " <td>2016-04-02 15:29:52</td>\n", | |
| " <td>122143012</td>\n", | |
| " <td>39</td>\n", | |
| " <td>122135985</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3460</th>\n", | |
| " <td>Сап, двачмурмурмур</td>\n", | |
| " <td>2016-04-02 19:04:39</td>\n", | |
| " <td>122164866</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4016</th>\n", | |
| " <td>Сап, двач. Самки совсем не привлекаюткуны, соб...</td>\n", | |
| " <td>2016-04-02 19:17:06</td>\n", | |
| " <td>122166050</td>\n", | |
| " <td>43</td>\n", | |
| " <td>122162369</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4469</th>\n", | |
| " <td>>>122145686 (OP)\\n>2007 – это когда ты молод, ...</td>\n", | |
| " <td>2016-04-02 18:26:16</td>\n", | |
| " <td>122161114</td>\n", | |
| " <td>58</td>\n", | |
| " <td>122145686</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5949</th>\n", | |
| " <td>Сап, братья.\\n\\nhttps://clyp.it/4kxvftup</td>\n", | |
| " <td>2016-04-01 18:38:07</td>\n", | |
| " <td>122054030</td>\n", | |
| " <td>127</td>\n", | |
| " <td>122036249</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6524</th>\n", | |
| " <td>>>122081897\\nСап, братья. Иногда мне кажется, ...</td>\n", | |
| " <td>2016-04-01 23:35:12</td>\n", | |
| " <td>122082655</td>\n", | |
| " <td>192</td>\n", | |
| " <td>122053833</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7849</th>\n", | |
| " <td>Сап б, сегодня вспомнил про свой аккаунт в Ori...</td>\n", | |
| " <td>2016-04-02 19:24:13</td>\n", | |
| " <td>122166782</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9101</th>\n", | |
| " <td>Трейнсёрфер в треде. Катаю сапсаны раз в недел...</td>\n", | |
| " <td>2016-04-02 13:51:34</td>\n", | |
| " <td>122133188</td>\n", | |
| " <td>302</td>\n", | |
| " <td>122106787</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>11383</th>\n", | |
| " <td>НЕТ СИЛ НА ДОСТИЖЕНИЯ\\nСап, /b/. Хуй 26 уровня...</td>\n", | |
| " <td>2016-04-02 18:14:00</td>\n", | |
| " <td>122159945</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>12464</th>\n", | |
| " <td>сап двач. я нюхнул одну дорожку фена. больше ...</td>\n", | |
| " <td>2016-04-02 19:47:11</td>\n", | |
| " <td>122169166</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13167</th>\n", | |
| " <td>ВкатилсяДля разговоров с Мэри и Кюри по душам\\...</td>\n", | |
| " <td>2016-04-02 17:38:44</td>\n", | |
| " <td>122156392</td>\n", | |
| " <td>8</td>\n", | |
| " <td>122155330</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13420</th>\n", | |
| " <td>>>122169408\\nИнвентарь:\\n5 серебряных монет\\nС...</td>\n", | |
| " <td>2016-04-02 19:54:12</td>\n", | |
| " <td>122169845</td>\n", | |
| " <td>261</td>\n", | |
| " <td>122155330</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13426</th>\n", | |
| " <td>>>122169801\\nДворф второй ботинок поставил тор...</td>\n", | |
| " <td>2016-04-02 19:59:49</td>\n", | |
| " <td>122170393</td>\n", | |
| " <td>267</td>\n", | |
| " <td>122155330</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13452</th>\n", | |
| " <td>>>122170719\\nДворф угомонился, поймал бутылку ...</td>\n", | |
| " <td>2016-04-02 20:18:07</td>\n", | |
| " <td>122172154</td>\n", | |
| " <td>293</td>\n", | |
| " <td>122155330</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13990</th>\n", | |
| " <td>Сап, анон.\\nСегодня прикупил себе аккаунтов.\\n...</td>\n", | |
| " <td>2016-04-02 20:02:52</td>\n", | |
| " <td>122170690</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>14306</th>\n", | |
| " <td>>>122161413\\nиз сапога резинового</td>\n", | |
| " <td>2016-04-02 18:36:09</td>\n", | |
| " <td>122162105</td>\n", | |
| " <td>36</td>\n", | |
| " <td>122160914</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " comment date \\\n", | |
| "517 Буду отписывать в 30 и 00 минут каждого часа (... 2016-04-02 15:29:52 \n", | |
| "3460 Сап, двачмурмурмур 2016-04-02 19:04:39 \n", | |
| "4016 Сап, двач. Самки совсем не привлекаюткуны, соб... 2016-04-02 19:17:06 \n", | |
| "4469 >>122145686 (OP)\\n>2007 – это когда ты молод, ... 2016-04-02 18:26:16 \n", | |
| "5949 Сап, братья.\\n\\nhttps://clyp.it/4kxvftup 2016-04-01 18:38:07 \n", | |
| "6524 >>122081897\\nСап, братья. Иногда мне кажется, ... 2016-04-01 23:35:12 \n", | |
| "7849 Сап б, сегодня вспомнил про свой аккаунт в Ori... 2016-04-02 19:24:13 \n", | |
| "9101 Трейнсёрфер в треде. Катаю сапсаны раз в недел... 2016-04-02 13:51:34 \n", | |
| "11383 НЕТ СИЛ НА ДОСТИЖЕНИЯ\\nСап, /b/. Хуй 26 уровня... 2016-04-02 18:14:00 \n", | |
| "12464 сап двач. я нюхнул одну дорожку фена. больше ... 2016-04-02 19:47:11 \n", | |
| "13167 ВкатилсяДля разговоров с Мэри и Кюри по душам\\... 2016-04-02 17:38:44 \n", | |
| "13420 >>122169408\\nИнвентарь:\\n5 серебряных монет\\nС... 2016-04-02 19:54:12 \n", | |
| "13426 >>122169801\\nДворф второй ботинок поставил тор... 2016-04-02 19:59:49 \n", | |
| "13452 >>122170719\\nДворф угомонился, поймал бутылку ... 2016-04-02 20:18:07 \n", | |
| "13990 Сап, анон.\\nСегодня прикупил себе аккаунтов.\\n... 2016-04-02 20:02:52 \n", | |
| "14306 >>122161413\\nиз сапога резинового 2016-04-02 18:36:09 \n", | |
| "\n", | |
| " num number parent \n", | |
| "517 122143012 39 122135985 \n", | |
| "3460 122164866 1 0 \n", | |
| "4016 122166050 43 122162369 \n", | |
| "4469 122161114 58 122145686 \n", | |
| "5949 122054030 127 122036249 \n", | |
| "6524 122082655 192 122053833 \n", | |
| "7849 122166782 1 0 \n", | |
| "9101 122133188 302 122106787 \n", | |
| "11383 122159945 1 0 \n", | |
| "12464 122169166 1 0 \n", | |
| "13167 122156392 8 122155330 \n", | |
| "13420 122169845 261 122155330 \n", | |
| "13426 122170393 267 122155330 \n", | |
| "13452 122172154 293 122155330 \n", | |
| "13990 122170690 1 0 \n", | |
| "14306 122162105 36 122160914 " | |
| ] | |
| }, | |
| "execution_count": 326, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Rows whose comment contains 'сап', ignoring letter case.\n", | |
| "sap_mask = posts['comment'].str.contains('сап', case=False)\n", | |
| "posts[sap_mask]" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.4.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment