Created
October 27, 2016 13:50
-
-
Save ledmonster/b65560f20e14c67da81f7eb067af995c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "%matplotlib inline" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import re" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.read_csv('data/tweets_20161027.tsv', sep='\\t')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " <th>num4</th>\n", | |
| " <th>tweet</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>@user_name texttext #tag #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>0</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext http://url</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>【交換】</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " num1 num2 num3 num4 tweet\n", | |
| "0 1 2 3 0.4 @user_name texttext #tag #tag\n", | |
| "1 1 2 3 0.8 texttext #tag\n", | |
| "2 0 2 3 0.4 RT texttext\n", | |
| "3 1 2 3 0.8 RT texttext\n", | |
| "4 1 2 3 0.8 texttext http://url\n", | |
| "5 1 2 3 0.4 texttext\n", | |
| "6 1 2 3 0.4 【交換】" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def is_rt(tweet):\n", | |
| " matched_texts = rt_pattern.findall(tweet) \n", | |
| " is_retweet = len(matched_texts) > 0 \n", | |
| " return is_retweet" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "rt_pattern = re.compile(r'^RT\\s(.*)')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "result = df.loc[\n", | |
| " (~df['tweet'].apply(is_rt)) & \n", | |
| " (df['num4'] < 0.8) \n", | |
| "] " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " <th>num4</th>\n", | |
| " <th>tweet</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>@user_name texttext #tag #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>【交換】</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " num1 num2 num3 num4 tweet\n", | |
| "0 1 2 3 0.4 @user_name texttext #tag #tag\n", | |
| "5 1 2 3 0.4 texttext\n", | |
| "6 1 2 3 0.4 【交換】" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "result" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### @マークを除く" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "re_atmark = re.compile(r'@[A-Za-z0-9_]+')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'NAME texttext #tag #tag'" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "re_atmark.sub('NAME', '@user_name texttext #tag #tag')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'こんにちは さん、 さん'" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "re_atmark.sub('', 'こんにちは @junya さん、@hiro さん')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def remove_atmark(tweet):\n", | |
| " return re_atmark.sub('', tweet).strip()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 texttext #tag #tag\n", | |
| "1 texttext #tag\n", | |
| "2 RT texttext\n", | |
| "3 RT texttext\n", | |
| "4 texttext http://url\n", | |
| "5 texttext\n", | |
| "6 【交換】\n", | |
| "Name: tweet, dtype: object" | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| " df['tweet'].apply(remove_atmark)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df2 = df.copy()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df2['tweet'] = df['tweet'].apply(remove_atmark)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " <th>num4</th>\n", | |
| " <th>tweet</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>texttext #tag #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>0</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext http://url</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>【交換】</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " num1 num2 num3 num4 tweet\n", | |
| "0 1 2 3 0.4 texttext #tag #tag\n", | |
| "1 1 2 3 0.8 texttext #tag\n", | |
| "2 0 2 3 0.4 RT texttext\n", | |
| "3 1 2 3 0.8 RT texttext\n", | |
| "4 1 2 3 0.8 texttext http://url\n", | |
| "5 1 2 3 0.4 texttext\n", | |
| "6 1 2 3 0.4 【交換】" | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df2" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'_'=='_'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'abcc'" | |
| ] | |
| }, | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "' abcc '.strip()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### 譲渡系アカウントを外す" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "re_ng_word = re.compile(r'交換')\n", | |
| "def has_ng_word(tweet):\n", | |
| " return '交換' in tweet\n", | |
| " # return bool(re_ng_word.match(tweet))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "text = 'アカウント交換してください'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "5" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "text.find('交換') " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "re_ng_word = re.compile(r'交換|置換|売買')\n", | |
| "def has_ng_word2(tweet):\n", | |
| " return bool(re_ng_word.search(tweet))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'アカウント交換してください'" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "text" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "re.match?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 True\n", | |
| "1 True\n", | |
| "2 True\n", | |
| "3 True\n", | |
| "4 True\n", | |
| "5 True\n", | |
| "6 False\n", | |
| "Name: tweet, dtype: bool" | |
| ] | |
| }, | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "~df2['tweet'].apply(has_ng_word2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df3 = df2[~df2['tweet'].apply(has_ng_word2)]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " <th>num4</th>\n", | |
| " <th>tweet</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>texttext #tag #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>0</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext http://url</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>texttext</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " num1 num2 num3 num4 tweet\n", | |
| "0 1 2 3 0.4 texttext #tag #tag\n", | |
| "1 1 2 3 0.8 texttext #tag\n", | |
| "2 0 2 3 0.4 RT texttext\n", | |
| "3 1 2 3 0.8 RT texttext\n", | |
| "4 1 2 3 0.8 texttext http://url\n", | |
| "5 1 2 3 0.4 texttext" | |
| ] | |
| }, | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df3" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### ツイートの長さ" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 False\n", | |
| "1 False\n", | |
| "2 False\n", | |
| "3 False\n", | |
| "4 False\n", | |
| "5 True\n", | |
| "Name: tweet, dtype: bool" | |
| ] | |
| }, | |
| "execution_count": 29, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df3['tweet'].str.len() < 10" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " <th>num4</th>\n", | |
| " <th>tweet</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>texttext #tag #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>0</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext http://url</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " num1 num2 num3 num4 tweet\n", | |
| "0 1 2 3 0.4 texttext #tag #tag\n", | |
| "1 1 2 3 0.8 texttext #tag\n", | |
| "2 0 2 3 0.4 RT texttext\n", | |
| "3 1 2 3 0.8 RT texttext\n", | |
| "4 1 2 3 0.8 texttext http://url" | |
| ] | |
| }, | |
| "execution_count": 30, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df3[(df3['tweet'].str.len() >= 10)]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " <th>num4</th>\n", | |
| " <th>tweet</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>texttext #tag #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>0</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext http://url</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " num1 num2 num3 num4 tweet\n", | |
| "0 1 2 3 0.4 texttext #tag #tag\n", | |
| "1 1 2 3 0.8 texttext #tag\n", | |
| "2 0 2 3 0.4 RT texttext\n", | |
| "3 1 2 3 0.8 RT texttext\n", | |
| "4 1 2 3 0.8 texttext http://url" | |
| ] | |
| }, | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df3[(df3['tweet'].str.len() >= 10) & (df3['tweet'].str.len() >= 10)]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### URL を外す" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 32, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "re_url = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'before URL abc'" | |
| ] | |
| }, | |
| "execution_count": 33, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "re_url.sub('URL', 'before http://www.google.com/abc abc')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def remove_url(tweet):\n", | |
| " return re_url.sub('', tweet)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 35, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 texttext #tag #tag\n", | |
| "1 texttext #tag\n", | |
| "2 RT texttext\n", | |
| "3 RT texttext\n", | |
| "4 texttext \n", | |
| "5 texttext\n", | |
| "Name: tweet, dtype: object" | |
| ] | |
| }, | |
| "execution_count": 35, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df3['tweet'].apply(remove_url)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import copy\n", | |
| "df4 = copy.copy(df3)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 37, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "df4['tweet'] = df3['tweet'].apply(remove_url)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>num1</th>\n", | |
| " <th>num2</th>\n", | |
| " <th>num3</th>\n", | |
| " <th>num4</th>\n", | |
| " <th>tweet</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>texttext #tag #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext #tag</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>0</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>RT texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.8</td>\n", | |
| " <td>texttext</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>1</td>\n", | |
| " <td>2</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.4</td>\n", | |
| " <td>texttext</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " num1 num2 num3 num4 tweet\n", | |
| "0 1 2 3 0.4 texttext #tag #tag\n", | |
| "1 1 2 3 0.8 texttext #tag\n", | |
| "2 0 2 3 0.4 RT texttext\n", | |
| "3 1 2 3 0.8 RT texttext\n", | |
| "4 1 2 3 0.8 texttext \n", | |
| "5 1 2 3 0.4 texttext" | |
| ] | |
| }, | |
| "execution_count": 38, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df4" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### タグを除く" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "re_tag = re.compile(r'<[^>]+>')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 40, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'aaa link ddd'" | |
| ] | |
| }, | |
| "execution_count": 40, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "re_tag.sub('', 'aaa <a href=\"fff\">link</a> ddd')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def remove_tag(tweet):\n", | |
| " return re_tag.sub('', tweet)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 42, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 texttext #tag #tag\n", | |
| "1 texttext #tag\n", | |
| "2 RT texttext\n", | |
| "3 RT texttext\n", | |
| "4 texttext \n", | |
| "5 texttext\n", | |
| "Name: tweet, dtype: object" | |
| ] | |
| }, | |
| "execution_count": 42, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df3['tweet'].apply(remove_tag)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 62, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "tweet = 'hello <a href=\">aaa\"> foo'\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 63, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'hello aaa\"> foo'" | |
| ] | |
| }, | |
| "execution_count": 63, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "remove_tag(tweet)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 64, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import lxml" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 65, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import lxml.html" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 66, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'hello foo'" | |
| ] | |
| }, | |
| "execution_count": 66, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "lxml.html.fromstring(tweet).text_content()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 67, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def remove_tag2(tweet):\n", | |
| " return lxml.html.fromstring(tweet).text_content()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 68, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 texttext #tag #tag\n", | |
| "1 texttext #tag\n", | |
| "2 RT texttext\n", | |
| "3 RT texttext\n", | |
| "4 texttext \n", | |
| "5 texttext\n", | |
| "Name: tweet, dtype: object" | |
| ] | |
| }, | |
| "execution_count": 68, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df3['tweet'].apply(remove_tag2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment