dspp779 · October 30, 2016 02:31
diff --git a/Practice161020.ipynb b/Practice161020.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Python 基本資料處理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Read file line by line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sentences = []\n",
    "for line in open('sentences.txt'):\n",
    "    # 刪減前後的空白與換行\n",
    "    line = line.strip()\n",
    "    # 將處理好的字串加入 sentences \n",
    "    sentences.append(line)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The World Heart Federation in Geneva , Switzerland , reports that the number of overweight and obese people is increasing .\n"
     ]
    }
   ],
   "source": [
    "print(sentences[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "string index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### or you can just write"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# list comprehension\n",
    "sentences = [line.strip() for line in open('sentences.txt')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The World Heart Federation in Geneva , Switzerland , reports that the number of overweight and obese people is increasing .\n"
     ]
    }
   ],
   "source": [
    "print(sentences[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### String operations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sentence = \"I want to eat an apple .\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### String characters can be accessed like list elements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'t'"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence[5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "eat\n"
     ]
    }
   ],
   "source": [
    "sentence[10:13]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'.'"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'eat an appl'"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence[10:-3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Find substrings in a string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence.find('a')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "find from right-hand side"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence.rfind('a')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "find with a starting point"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "11"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence.find('a', 4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`find` returns -1 when the substring is not found"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-1"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence.find('can')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Combine slicing with `find` / `rfind` to extract a substring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'want to eat an apple '"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence[sentence.find('want to'):sentence.rfind('.')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### String Normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'I want to eat an apple .'"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'i want to eat an apple .'"
      ]
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence.lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'I WANT TO EAT AN APPLE .'"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence.upper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'I want to eat an apple .'"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence.capitalize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'A'.isupper()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'A'.islower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'apple'.isalpha()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'20'.isdigit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'20.9'.isdigit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'20'.isdecimal()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'furen5566'.isalnum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split a sentence on spaces"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['I', 'want', 'to', 'eat', 'an', 'apple', '.']"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the result is list of words in the sentence\n",
    "sentence.split(' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence.endswith('.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentence.startswith('He wants')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dictionary examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "book = dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "book['title'] = 'Natural Language Processing with Python'\n",
    "book['author'] = 'Bird, Klein, and Loper'\n",
    "book['year'] = 2009"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'author': 'Bird, Klein, and Loper',\n",
       " 'title': 'Natural Language Processing with Python',\n",
       " 'year': 2009}"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "book"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['year', 'title', 'author'])"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "book.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_values([2009, 'Natural Language Processing with Python', 'Bird, Klein, and Loper'])"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "book.values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_items([('year', 2009), ('title', 'Natural Language Processing with Python'), ('author', 'Bird, Klein, and Loper')])"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "book.items()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "string formatting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'{0} is a book written by {1} in {2}'.format(book['title'], book['author'], book['year'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# advanced formatting\n",
    "'{title} is a book written by {author} in {year}'.format(**book)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Counting Example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data = ['red', 'red', 'red', 'red', 'yellow', 'yellow', 'yellow', 'blue', 'blue']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "counter = dict()\n",
    "for color in data:\n",
    "    if color in counter:\n",
    "        counter[color] += 1\n",
    "    else:\n",
    "        counter[color] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'blue': 2, 'red': 4, 'yellow': 3}"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use `defaultdict`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "counter = defaultdict(lambda: 0)  # default factory: a function that returns 0\n",
    "counter = defaultdict(int)  # equivalent: int() returns 0, so missing keys start at 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for color in data:\n",
    "    counter[color] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defaultdict(int, {'blue': 2, 'red': 4, 'yellow': 3})"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Use `collections.Counter` from the standard library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "counter = Counter(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({'blue': 2, 'red': 4, 'yellow': 3})"
      ]
     },
     "execution_count": 161,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "new_data = ['blue', 'red', 'blue', 'yellow', 'blue', 'yellow', 'blue', 'yellow', 'blue']\n",
    "counter.update(new_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({'blue': 7, 'red': 5, 'yellow': 6})"
      ]
     },
     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### most common elements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('blue', 7), ('yellow', 6), ('red', 5)]"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "counter.most_common()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('blue', 7), ('yellow', 6)]"
      ]
     },
     "execution_count": 165,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "counter.most_common(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "blue: 7\n",
      "yellow: 6\n",
      "red: 5\n"
     ]
    }
   ],
   "source": [
    "for color, count in counter.most_common():\n",
    "    print('{0}: {1}'.format(color, count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "# clear counter\n",
    "counter.clear()\n",
    "print(counter['blue'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compute the word frequencies in \"sentences.txt\":\n",
    "- read sentences from the file \"sentences.txt\"\n",
    "- split sentences into words (split)\n",
    "- filter out symbols (isalpha, isdigit, isalnum)\n",
    "- normalize words ('Word' and 'word' are considered the same word)\n",
    "- count the occurrences of words (see the counting example)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "write your code here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('the', 325885),\n",
       " ('of', 160369),\n",
       " ('to', 152595),\n",
       " ('in', 126200),\n",
       " ('and', 114420),\n",
       " ('a', 108957),\n",
       " ('The', 61956),\n",
       " ('is', 58325),\n",
       " ('that', 55133),\n",
       " ('for', 51248),\n",
       " ('s', 40005),\n",
       " ('was', 35760),\n",
       " ('on', 35210),\n",
       " ('are', 33740),\n",
       " ('with', 31171),\n",
       " ('have', 29856),\n",
       " ('said', 26795),\n",
       " ('has', 26629),\n",
       " ('from', 26454),\n",
       " ('it', 26116)]"
      ]
     },
     "execution_count": 169,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wordCounter.most_common(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
 }
diff --git a/practice2-tagcloud.ipynb b/practice2-tagcloud.ipynb
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Python 基本資料處理"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Read file line by line"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"sentences = []\n",
	"for line in open('sentences.txt'):\n",
	" # 刪減前後的空白與換行\n",
	" line = line.strip()\n",
	" # 將處理好的字串加入 sentences \n",
	" sentences.append(line)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The World Heart Federation in Geneva , Switzerland , reports that the number of overweight and obese people is increasing .\n"
	]
	}
	],
	"source": [
	"print(sentences[0])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"string index"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### or you can just write"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# list comprehension\n",
	"sentences = [line.strip() for line in open('sentences.txt')]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The World Heart Federation in Geneva , Switzerland , reports that the number of overweight and obese people is increasing .\n"
	]
	}
	],
	"source": [
	"print(sentences[0])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### String operations"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"sentence = \"I want to eat an apple .\""
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### String characters can be accessed like list elements"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 54,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'t'"
	]
	},
	"execution_count": 54,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence[5]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 49,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"eat\n"
	]
	}
	],
	"source": [
	"sentence[10:13]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 56,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'.'"
	]
	},
	"execution_count": 56,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence[-1]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 57,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'eat an appl'"
	]
	},
	"execution_count": 57,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence[10:-3]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Find substrings in a string"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 68,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"3"
	]
	},
	"execution_count": 68,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence.find('a')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"find from right-hand side"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 67,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"17"
	]
	},
	"execution_count": 67,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence.rfind('a')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"find with a starting point"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 71,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"11"
	]
	},
	"execution_count": 71,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence.find('a', 4)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"`find` returns -1 when the substring is not found"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 63,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"-1"
	]
	},
	"execution_count": 63,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence.find('can')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Combine slicing with `find` / `rfind` to extract a substring"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 65,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'want to eat an apple '"
	]
	},
	"execution_count": 65,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence[sentence.find('want to'):sentence.rfind('.')]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### String Normalization"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 80,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'I want to eat an apple .'"
	]
	},
	"execution_count": 80,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 81,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'i want to eat an apple .'"
	]
	},
	"execution_count": 81,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence.lower()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 82,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'I WANT TO EAT AN APPLE .'"
	]
	},
	"execution_count": 82,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence.upper()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 83,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'I want to eat an apple .'"
	]
	},
	"execution_count": 83,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence.capitalize()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 86,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 86,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"'A'.isupper()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 87,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"False"
	]
	},
	"execution_count": 87,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"'A'.islower()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 92,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 92,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"'apple'.isalpha()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 93,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 93,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"'20'.isdigit()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 94,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"False"
	]
	},
	"execution_count": 94,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"'20.9'.isdigit()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 99,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 99,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"'20'.isdecimal()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 101,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 101,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"'furen5566'.isalnum()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Split a sentence on spaces"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 78,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['I', 'want', 'to', 'eat', 'an', 'apple', '.']"
	]
	},
	"execution_count": 78,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# the result is list of words in the sentence\n",
	"sentence.split(' ')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 102,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 102,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence.endswith('.')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 103,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"False"
	]
	},
	"execution_count": 103,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"sentence.startswith('He wants')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Dictionary examples"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 104,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"book = dict()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 106,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"book['title'] = 'Natural Language Processing with Python'\n",
	"book['author'] = 'Bird, Klein, and Loper'\n",
	"book['year'] = 2009"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 107,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'author': 'Bird, Klein, and Loper',\n",
	" 'title': 'Natural Language Processing with Python',\n",
	" 'year': 2009}"
	]
	},
	"execution_count": 107,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"book"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 112,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"dict_keys(['year', 'title', 'author'])"
	]
	},
	"execution_count": 112,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"book.keys()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 113,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"dict_values([2009, 'Natural Language Processing with Python', 'Bird, Klein, and Loper'])"
	]
	},
	"execution_count": 113,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"book.values()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 114,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"dict_items([('year', 2009), ('title', 'Natural Language Processing with Python'), ('author', 'Bird, Klein, and Loper')])"
	]
	},
	"execution_count": 114,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"book.items()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"string formatting"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 108,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'"
	]
	},
	"execution_count": 108,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"'{0} is a book written by {1} in {2}'.format(book['title'], book['author'], book['year'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 109,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'Natural Language Processing with Python is a book written by Bird, Klein, and Loper in 2009'"
	]
	},
	"execution_count": 109,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# advanced formatting: named fields are filled in by unpacking the dict with **\n",
	"'{title} is a book written by {author} in {year}'.format(**book)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Counting Example"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 123,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"data = ['red', 'red', 'red', 'red', 'yellow', 'yellow', 'yellow', 'blue', 'blue']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 124,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"counter = dict()\n",
	"for color in data:\n",
	" if color in counter:\n",
	" counter[color] += 1\n",
	" else:\n",
	" counter[color] = 1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 125,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'blue': 2, 'red': 4, 'yellow': 3}"
	]
	},
	"execution_count": 125,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"counter"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### use default dictionary"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 129,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from collections import defaultdict\n",
	"counter = defaultdict(lambda: 0) # the default factory is a function that returns 0\n",
	"counter = defaultdict(int) # equivalent: the default factory is \"int\", and int() returns 0"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 130,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"for color in data:\n",
	" counter[color] += 1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 131,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"defaultdict(int, {'blue': 2, 'red': 4, 'yellow': 3})"
	]
	},
	"execution_count": 131,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"counter"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### use built-in Counter"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 159,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from collections import Counter"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 160,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"counter = Counter(data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 161,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Counter({'blue': 2, 'red': 4, 'yellow': 3})"
	]
	},
	"execution_count": 161,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"counter"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 162,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"new_data = ['blue', 'red', 'blue', 'yellow', 'blue', 'yellow', 'blue', 'yellow', 'blue']\n",
	"counter.update(new_data)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 163,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Counter({'blue': 7, 'red': 5, 'yellow': 6})"
	]
	},
	"execution_count": 163,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"counter"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### most common elements"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 164,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('blue', 7), ('yellow', 6), ('red', 5)]"
	]
	},
	"execution_count": 164,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"counter.most_common()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 165,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('blue', 7), ('yellow', 6)]"
	]
	},
	"execution_count": 165,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"counter.most_common(2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 166,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"blue: 7\n",
	"yellow: 6\n",
	"red: 5\n"
	]
	}
	],
	"source": [
	"for color, count in counter.most_common():\n",
	" print('{0}: {1}'.format(color, count))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 147,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"0\n"
	]
	}
	],
	"source": [
	"# clear counter\n",
	"counter.clear()\n",
	"print(counter['blue'])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Exercise"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Compute the word frequencies in \"sentences.txt\":\n",
	"- read sentences from file \"sentences.txt\"\n",
	"- split sentences into words (split)\n",
	"- filter out symbols (isalpha, isdigit, isalnum)\n",
	"- normalize words before counting ('Word' and 'word' are considered the same word)\n",
	"- count the occurrences of words (see the counting example above)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"write your code here"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 167,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": 169,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('the', 325885),\n",
	" ('of', 160369),\n",
	" ('to', 152595),\n",
	" ('in', 126200),\n",
	" ('and', 114420),\n",
	" ('a', 108957),\n",
	" ('The', 61956),\n",
	" ('is', 58325),\n",
	" ('that', 55133),\n",
	" ('for', 51248),\n",
	" ('s', 40005),\n",
	" ('was', 35760),\n",
	" ('on', 35210),\n",
	" ('are', 33740),\n",
	" ('with', 31171),\n",
	" ('have', 29856),\n",
	" ('said', 26795),\n",
	" ('has', 26629),\n",
	" ('from', 26454),\n",
	" ('it', 26116)]"
	]
	},
	"execution_count": 169,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"wordCounter.most_common(20)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}
No results found