Last active
October 30, 2016 02:31
-
-
Save dspp779/04098c47af27b0a7b5c3f68db791971f to your computer and use it in GitHub Desktop.
Practice for khub
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Python 基本資料處理" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### Read file line by line" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "sentences = []\n", | |
| "for line in open('sentences.txt'):\n", | |
| " # 刪減前後的空白與換行\n", | |
| " line = line.strip()\n", | |
| " # 將處理好的字串加入 sentences \n", | |
| " sentences.append(line)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "The World Heart Federation in Geneva , Switzerland , reports that the number of overweight and obese people is increasing .\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(sentences[0])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "string index" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### or you can just write" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# list comprehension\n", | |
| "sentences = [line.strip() for line in open('sentences.txt')]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "The World Heart Federation in Geneva , Switzerland , reports that the number of overweight and obese people is increasing .\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(sentences[0])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### String operations" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "sentence = \"I want to eat an apple .\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### string character can be access like list elements" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 54, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'t'" | |
| ] | |
| }, | |
| "execution_count": 54, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence[5]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "eat\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "sentence[10:13]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 56, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'.'" | |
| ] | |
| }, | |
| "execution_count": 56, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence[-1]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 57, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'eat an appl'" | |
| ] | |
| }, | |
| "execution_count": 57, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence[10:-3]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### find sequences in string" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 68, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "3" | |
| ] | |
| }, | |
| "execution_count": 68, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence.find('a')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "find from right-hand side" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 67, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "17" | |
| ] | |
| }, | |
| "execution_count": 67, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence.rfind('a')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "find with a starting point" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 71, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "11" | |
| ] | |
| }, | |
| "execution_count": 71, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence.find('a', 4)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "return -1 when not found" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 63, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "-1" | |
| ] | |
| }, | |
| "execution_count": 63, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence.find('can')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "combine the use of subsequence and find" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 65, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'want to eat an apple '" | |
| ] | |
| }, | |
| "execution_count": 65, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence[sentence.find('want to'):sentence.rfind('.')]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### String Normalization" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 80, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'I want to eat an apple .'" | |
| ] | |
| }, | |
| "execution_count": 80, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 81, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'i want to eat an apple .'" | |
| ] | |
| }, | |
| "execution_count": 81, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence.lower()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 82, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'I WANT TO EAT AN APPLE .'" | |
| ] | |
| }, | |
| "execution_count": 82, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence.upper()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 83, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'I want to eat an apple .'" | |
| ] | |
| }, | |
| "execution_count": 83, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence.capitalize()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 86, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 86, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'A'.isupper()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 87, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "False" | |
| ] | |
| }, | |
| "execution_count": 87, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'A'.islower()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 92, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 92, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'apple'.isalpha()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 93, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 93, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'20'.isdigit()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 94, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "False" | |
| ] | |
| }, | |
| "execution_count": 94, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'20.9'.isdigit()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 99, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 99, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'20'.isdecimal()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 101, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 101, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'furen5566'.isalnum()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### split sentence by blank" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 78, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['I', 'want', 'to', 'eat', 'an', 'apple', '.']" | |
| ] | |
| }, | |
| "execution_count": 78, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# the result is list of words in the sentence\n", | |
| "sentence.split(' ')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 102, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 102, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence.endswith('.')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 103, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "False" | |
| ] | |
| }, | |
| "execution_count": 103, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sentence.startswith('He wants')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Dictionary examples" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 104, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "book = dict()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 106, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "book['title'] = 'Natural Language Processing with Python'\n", | |
| "book['author'] = 'Bird, Klein, and Loper'\n", | |
| "book['year'] = 2009" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 107, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'author': 'Bird, Klein, and Loper',\n", | |
| " 'title': 'Natural Language Processing with Python',\n", | |
| " 'year': 2009}" | |
| ] | |
| }, | |
| "execution_count": 107, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "book" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 112, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "dict_keys(['year', 'title', 'author'])" | |
| ] | |
| }, | |
| "execution_count": 112, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "book.keys()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 113, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "dict_values([2009, 'Natural Language Processing with Python', 'Bird, Klein, and Loper'])" | |
| ] | |
| }, | |
| "execution_count": 113, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "book.values()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 114, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "dict_items([('year', 2009), ('title', 'Natural Language Processing with Python'), ('author', 'Bird, Klein, and Loper')])" | |
| ] | |
| }, | |
| "execution_count": 114, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "book.items()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "string formatting" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 108, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'" | |
| ] | |
| }, | |
| "execution_count": 108, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "'{0} is a book written by {1} in {2}'.format(book['title'], book['author'], book['year'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 109, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'" | |
| ] | |
| }, | |
| "execution_count": 109, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# advanced formatting\n", | |
| "'{title} is a book written by {author} in {year}'.format(**book)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Counting Example" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 123, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "data = ['red', 'red', 'red', 'red', 'yellow', 'yellow', 'yellow', 'blue', 'blue']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 124, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "counter = dict()\n", | |
| "for color in data:\n", | |
| " if color in counter:\n", | |
| " counter[color] += 1\n", | |
| " else:\n", | |
| " counter[color] = 1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 125, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'blue': 2, 'red': 4, 'yellow': 3}" | |
| ] | |
| }, | |
| "execution_count": 125, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "counter" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### use default dictionary" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 129, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from collections import defaultdict\n", | |
| "counter = defaultdict(lambda: 0) # default value function is 0\n", | |
| "counter = defaultdict(int) # default value function is \"int\", which initialize to 0" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 130, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "for color in data:\n", | |
| " counter[color] += 1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 131, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "defaultdict(int, {'blue': 2, 'red': 4, 'yellow': 3})" | |
| ] | |
| }, | |
| "execution_count": 131, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "counter" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### use built-in Counter" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 159, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from collections import Counter" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 160, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "counter = Counter(data)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 161, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Counter({'blue': 2, 'red': 4, 'yellow': 3})" | |
| ] | |
| }, | |
| "execution_count": 161, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "counter" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 162, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "new_data = ['blue', 'red', 'blue', 'yellow', 'blue', 'yellow', 'blue', 'yellow', 'blue']\n", | |
| "counter.update(new_data)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 163, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Counter({'blue': 7, 'red': 5, 'yellow': 6})" | |
| ] | |
| }, | |
| "execution_count": 163, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "counter" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "#### most common elements" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 164, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('blue', 7), ('yellow', 6), ('red', 5)]" | |
| ] | |
| }, | |
| "execution_count": 164, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "counter.most_common()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 165, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('blue', 7), ('yellow', 6)]" | |
| ] | |
| }, | |
| "execution_count": 165, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "counter.most_common(2)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 166, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "blue: 7\n", | |
| "yellow: 6\n", | |
| "red: 5\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "for color, count in counter.most_common():\n", | |
| " print('{0}: {1}'.format(color, count))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 147, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "0\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# clear counter\n", | |
| "counter.clear()\n", | |
| "print(counter['blue'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Exercise" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "compute the word frequencies in \"sentences.txt\"\n", | |
| "- read sentences from file \"sentences.txt\"\n", | |
| "- split sentences into words (split)\n", | |
| "- filter out symbols (isalpha, isdigit, isalnum)\n", | |
| "- normalize words and count ('Word' and 'word' are considered as the same word)\n", | |
| "- count the occurance of words (counting exmaple)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "write your code here" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 167, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 169, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('the', 325885),\n", | |
| " ('of', 160369),\n", | |
| " ('to', 152595),\n", | |
| " ('in', 126200),\n", | |
| " ('and', 114420),\n", | |
| " ('a', 108957),\n", | |
| " ('The', 61956),\n", | |
| " ('is', 58325),\n", | |
| " ('that', 55133),\n", | |
| " ('for', 51248),\n", | |
| " ('s', 40005),\n", | |
| " ('was', 35760),\n", | |
| " ('on', 35210),\n", | |
| " ('are', 33740),\n", | |
| " ('with', 31171),\n", | |
| " ('have', 29856),\n", | |
| " ('said', 26795),\n", | |
| " ('has', 26629),\n", | |
| " ('from', 26454),\n", | |
| " ('it', 26116)]" | |
| ] | |
| }, | |
| "execution_count": 169, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "wordCounter.most_common(20)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 1 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment