dspp779 · February 20, 2017 06:29
diff --git a/Practice170221-python basic.ipynb b/Practice170221-python basic.ipynb
diff --git a/Practice170221-tokenization and pos tagging.ipynb b/Practice170221-tokenization and pos tagging.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# http://www.bbc.com/news/business-38930699\n",
    "# Read inside a `with` block so the file handle is closed as soon as the\n",
    "# text is in memory, rather than whenever the garbage collector gets to it.\n",
    "with open('ticking_spotify.txt') as article_file:\n",
    "    text = article_file.read()\n",
    "# One entry per line of the file, with surrounding whitespace stripped.\n",
    "sents = [line.strip() for line in text.splitlines()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Tokenization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Using nltk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "Install nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "# Invoke pip through the kernel's own interpreter: a bare `pip3` on PATH may\n",
    "# belong to a different Python, leaving nltk un-importable from this notebook.\n",
    "!\"{sys.executable}\" -m pip install -U nltk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download tokenization model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "nltk.download('punkt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['It', \"'s\", 'amazing', 'to', 'think', 'that', 'just', '10', 'years', 'ago', ',', 'flat-rate', 'digital', 'music', 'streaming', 'services', 'were', 'a', 'mere', 'gleam', 'in', 'the', 'eye', 'of', 'industry', 'executives', '.']\n"
     ]
    }
   ],
   "source": [
    "from nltk import word_tokenize\n",
    "print(word_tokenize(sents[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['It', \"'\", 's', 'amazing', 'to', 'think', 'that', 'just', '10', 'years', 'ago', ',', 'flat', '-', 'rate', 'digital', 'music', 'streaming', 'services', 'were', 'a', 'mere', 'gleam', 'in', 'the', 'eye', 'of', 'industry', 'executives', '.']\n"
     ]
    }
   ],
   "source": [
    "from nltk import wordpunct_tokenize\n",
    "print(wordpunct_tokenize(sents[1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filter stopwords"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download stopword corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the English stopword list and store it in a set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Import the corpus reader under an alias so that the `stopwords` set built\n",
    "# below does not rebind (shadow) the NLTK object it was created from.\n",
    "from nltk.corpus import stopwords as stopwords_corpus\n",
    "stopwords = set(stopwords_corpus.words('english'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'further', 'off', '@', 'and', '.', 'having', '%', ',', 'for', \"'\", 'needn', 'wasn', 'then', 'did', 'isn', 'didn', 'm', 'if', 'of', 'theirs', ';', 'above', ':', 'll', '?', '&', 'mightn', 'while', 'each', 'wouldn', '#', '(', 'until', '\\\\', 'an', 'ours', 'weren', 'more', 'himself', 'her', 'am', 'what', 'they', 'in', 'its', 'aren', 'there', 'other', 'd', 'own', 't', 'the', 'than', 'no', 'do', 'couldn', '`', 'it', 'has', 'yours', 'because', 'i', 'here', 'why', 'these', 'my', 'who', 'where', 'haven', 'is', 'themselves', 'yourselves', 'through', 'from', 'out', 'very', 'ain', '<', '$', 'to', 'all', '|', 'he', 'been', 'over', 'or', '-', 'will', 'this', 'doing', 'don', 'she', 'that', 'those', 'his', 'our', 'such', 'before', 'between', 'was', 'me', 'be', 'both', 'have', 'shan', '\"', 'can', 'by', 'during', 'below', '^', 'myself', 'at', 'once', ')', '+', 'are', '~', 'ma', 'too', 'hasn', 'so', 'them', 'when', 'now', 'on', 'herself', 'won', '_', 'him', 'most', 'a', 'with', 'just', 'not', '}', 'their', '!', 'which', 'up', '*', '{', 'should', '[', 'y', 'few', 'again', 'only', 're', 'hadn', 've', 'but', '>', 'mustn', 'whom', 'about', 'itself', 'being', 'hers', 'as', 'ourselves', 'some', 'o', 'were', 'any', 'same', 'doesn', 'how', 'we', 'you', 'shouldn', 'your', 'down', 'had', 'does', 'under', 'into', 'yourself', 'against', 'after', 's', '=', ']', '/', 'nor'}\n"
     ]
    }
   ],
   "source": [
    "print(stopwords)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Filter example: drop the stopwords from a tokenized sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\"'s\", 'amazing', 'think', '10', 'years', 'ago', 'flat-rate', 'digital', 'music', 'streaming', 'services', 'mere', 'gleam', 'eye', 'industry', 'executives']\n"
     ]
    }
   ],
   "source": [
    "words = word_tokenize(sents[1])\n",
    "# The stopword entries are all lower-case, so compare the lower-cased token;\n",
    "# otherwise capitalised stopwords such as 'It' slip through the filter.\n",
    "# The comprehension is the compact form of looping over `words` and keeping\n",
    "# every word that is not a stopword.\n",
    "print([word for word in words if word.lower() not in stopwords])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Take punctuation into account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import string\n",
    "string.punctuation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add punctuation to the stopword set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# you can add one at a time\n",
    "for symbol in string.punctuation:\n",
    "    stopwords.add(symbol)\n",
    "# or update a sequence into the set\n",
    "stopwords.update(string.punctuation)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "source": [
    "### Part-of-speech tagging"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download the NLTK POS tagging model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "nltk.download('averaged_perceptron_tagger')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "words = word_tokenize(sents[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('It', 'PRP'),\n",
       " (\"'s\", 'VBZ'),\n",
       " ('amazing', 'JJ'),\n",
       " ('to', 'TO'),\n",
       " ('think', 'VB'),\n",
       " ('that', 'DT'),\n",
       " ('just', 'RB'),\n",
       " ('10', 'CD'),\n",
       " ('years', 'NNS'),\n",
       " ('ago', 'RB'),\n",
       " (',', ','),\n",
       " ('flat-rate', 'JJ'),\n",
       " ('digital', 'JJ'),\n",
       " ('music', 'NN'),\n",
       " ('streaming', 'NN'),\n",
       " ('services', 'NNS'),\n",
       " ('were', 'VBD'),\n",
       " ('a', 'DT'),\n",
       " ('mere', 'JJ'),\n",
       " ('gleam', 'NN'),\n",
       " ('in', 'IN'),\n",
       " ('the', 'DT'),\n",
       " ('eye', 'NN'),\n",
       " ('of', 'IN'),\n",
       " ('industry', 'NN'),\n",
       " ('executives', 'NNS'),\n",
       " ('.', '.')]"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.pos_tag(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The/DT clock/NN is/VBZ ticking/VBG for/IN Spotify/NNP\n"
     ]
    }
   ],
   "source": [
    "for sent in nltk.pos_tag_sents(word_tokenize(sent) for sent in sents):\n",
    "    print(' '.join('{0}/{1}'.format(word, pos) for word, pos in sent))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# spaCy\n",
    "Industrial-strength Natural Language Processing (NLP) with Python and Cython\n",
    "\n",
    "Homepage: https://spacy.io\n",
    "\n",
    "GitHub: https://github.com/explosion/spaCy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Install spaCy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "# Invoke pip through the kernel's own interpreter: a bare `pip3` on PATH may\n",
    "# belong to a different Python, leaving spaCy un-importable from this notebook.\n",
    "!\"{sys.executable}\" -m pip install -U spaCy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Download language models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "# Run the downloader with the kernel's own interpreter so it targets the same\n",
    "# spaCy installation that `import spacy` uses in this notebook.\n",
    "# NOTE(review): the `spacy.en.download` module is specific to older spaCy releases;\n",
    "# newer ones use `python -m spacy download <model>` -- confirm against the installed version.\n",
    "!\"{sys.executable}\" -m spacy.en.download all"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using spaCy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Loading the language data might take some time..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import spacy\n",
    "nlp = spacy.load('en')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The/DET clock/NOUN is/VERB ticking/VERB for/ADP Spotify/PROPN \n"
     ]
    }
   ],
   "source": [
    "for t in sents:\n",
    "    doc = nlp(t)\n",
    "    for sent in doc.sents:\n",
    "        for token in sent:\n",
    "            print('{}/{}'.format(token, token.pos_), end=' ')\n",
    "        print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The/DET clock/NOUN is/VERB ticking/VERB for/ADP Spotify/PROPN \n",
      "/SPACE \n"
     ]
    }
   ],
   "source": [
    "doc = nlp(text)\n",
    "for sent in doc.sents:\n",
    "    for token in sent:\n",
    "        print('{}/{}'.format(token, token.pos_), end=' ')\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"# http://www.bbc.com/news/business-38930699\n",
	"# Read inside a `with` block so the file handle is closed as soon as the\n",
	"# text is in memory, rather than whenever the garbage collector gets to it.\n",
	"with open('ticking_spotify.txt') as article_file:\n",
	"    text = article_file.read()\n",
	"# One entry per line of the file, with surrounding whitespace stripped.\n",
	"sents = [line.strip() for line in text.splitlines()]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"## Tokenization"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"### Using nltk"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"deletable": true,
	"editable": true
	},
	"source": [
	"Install nltk"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"import sys\n",
	"# Invoke pip through the kernel's own interpreter: a bare `pip3` on PATH may\n",
	"# belong to a different Python, leaving nltk un-importable from this notebook.\n",
	"!\"{sys.executable}\" -m pip install -U nltk"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Download tokenization model"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import nltk\n",
	"nltk.download('punkt')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['It', \"'s\", 'amazing', 'to', 'think', 'that', 'just', '10', 'years', 'ago', ',', 'flat-rate', 'digital', 'music', 'streaming', 'services', 'were', 'a', 'mere', 'gleam', 'in', 'the', 'eye', 'of', 'industry', 'executives', '.']\n"
	]
	}
	],
	"source": [
	"from nltk import word_tokenize\n",
	"print(word_tokenize(sents[1]))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['It', \"'\", 's', 'amazing', 'to', 'think', 'that', 'just', '10', 'years', 'ago', ',', 'flat', '-', 'rate', 'digital', 'music', 'streaming', 'services', 'were', 'a', 'mere', 'gleam', 'in', 'the', 'eye', 'of', 'industry', 'executives', '.']\n"
	]
	}
	],
	"source": [
	"from nltk import wordpunct_tokenize\n",
	"print(wordpunct_tokenize(sents[1]))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Filter stopwords"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Download stopword corpus"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import nltk\n",
	"nltk.download('stopwords')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Load the English stopword list and store it in a set"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Import the corpus reader under an alias so that the `stopwords` set built\n",
	"# below does not rebind (shadow) the NLTK object it was created from.\n",
	"from nltk.corpus import stopwords as stopwords_corpus\n",
	"stopwords = set(stopwords_corpus.words('english'))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"{'further', 'off', '@', 'and', '.', 'having', '%', ',', 'for', \"'\", 'needn', 'wasn', 'then', 'did', 'isn', 'didn', 'm', 'if', 'of', 'theirs', ';', 'above', ':', 'll', '?', '&', 'mightn', 'while', 'each', 'wouldn', '#', '(', 'until', '\\\\', 'an', 'ours', 'weren', 'more', 'himself', 'her', 'am', 'what', 'they', 'in', 'its', 'aren', 'there', 'other', 'd', 'own', 't', 'the', 'than', 'no', 'do', 'couldn', '`', 'it', 'has', 'yours', 'because', 'i', 'here', 'why', 'these', 'my', 'who', 'where', 'haven', 'is', 'themselves', 'yourselves', 'through', 'from', 'out', 'very', 'ain', '<', '$', 'to', 'all', '|', 'he', 'been', 'over', 'or', '-', 'will', 'this', 'doing', 'don', 'she', 'that', 'those', 'his', 'our', 'such', 'before', 'between', 'was', 'me', 'be', 'both', 'have', 'shan', '\"', 'can', 'by', 'during', 'below', '^', 'myself', 'at', 'once', ')', '+', 'are', '~', 'ma', 'too', 'hasn', 'so', 'them', 'when', 'now', 'on', 'herself', 'won', '_', 'him', 'most', 'a', 'with', 'just', 'not', '}', 'their', '!', 'which', 'up', '*', '{', 'should', '[', 'y', 'few', 'again', 'only', 're', 'hadn', 've', 'but', '>', 'mustn', 'whom', 'about', 'itself', 'being', 'hers', 'as', 'ourselves', 'some', 'o', 'were', 'any', 'same', 'doesn', 'how', 'we', 'you', 'shouldn', 'your', 'down', 'had', 'does', 'under', 'into', 'yourself', 'against', 'after', 's', '=', ']', '/', 'nor'}\n"
	]
	}
	],
	"source": [
	"print(stopwords)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Filter example: drop the stopwords from a tokenized sentence"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[\"'s\", 'amazing', 'think', '10', 'years', 'ago', 'flat-rate', 'digital', 'music', 'streaming', 'services', 'mere', 'gleam', 'eye', 'industry', 'executives']\n"
	]
	}
	],
	"source": [
	"words = word_tokenize(sents[1])\n",
	"# The stopword entries are all lower-case, so compare the lower-cased token;\n",
	"# otherwise capitalised stopwords such as 'It' slip through the filter.\n",
	"# The comprehension is the compact form of looping over `words` and keeping\n",
	"# every word that is not a stopword.\n",
	"print([word for word in words if word.lower() not in stopwords])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Take punctuation into account"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
	]
	},
	"execution_count": 18,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import string\n",
	"string.punctuation"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Add punctuation to the stopword set"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Option 1: insert the punctuation marks into the set one by one\n",
	"for symbol in string.punctuation:\n",
	"    stopwords.add(symbol)\n",
	"# Option 2: merge the whole punctuation string into the set in a single call\n",
	"stopwords.update(string.punctuation)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true,
	"deletable": true,
	"editable": true
	},
	"source": [
	"### Part-of-speech tagging"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Download the NLTK POS tagging model"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"deletable": true,
	"editable": true
	},
	"outputs": [],
	"source": [
	"import nltk\n",
	"nltk.download('averaged_perceptron_tagger')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 36,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"words = word_tokenize(sents[1])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 37,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[('It', 'PRP'),\n",
	" (\"'s\", 'VBZ'),\n",
	" ('amazing', 'JJ'),\n",
	" ('to', 'TO'),\n",
	" ('think', 'VB'),\n",
	" ('that', 'DT'),\n",
	" ('just', 'RB'),\n",
	" ('10', 'CD'),\n",
	" ('years', 'NNS'),\n",
	" ('ago', 'RB'),\n",
	" (',', ','),\n",
	" ('flat-rate', 'JJ'),\n",
	" ('digital', 'JJ'),\n",
	" ('music', 'NN'),\n",
	" ('streaming', 'NN'),\n",
	" ('services', 'NNS'),\n",
	" ('were', 'VBD'),\n",
	" ('a', 'DT'),\n",
	" ('mere', 'JJ'),\n",
	" ('gleam', 'NN'),\n",
	" ('in', 'IN'),\n",
	" ('the', 'DT'),\n",
	" ('eye', 'NN'),\n",
	" ('of', 'IN'),\n",
	" ('industry', 'NN'),\n",
	" ('executives', 'NNS'),\n",
	" ('.', '.')]"
	]
	},
	"execution_count": 37,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"nltk.pos_tag(words)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 51,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The/DT clock/NN is/VBZ ticking/VBG for/IN Spotify/NNP\n"
	]
	}
	],
	"source": [
	"# Tokenize and tag every line of the article, then print each line as word/TAG pairs.\n",
	"for sent in nltk.pos_tag_sents(word_tokenize(sent) for sent in sents):\n",
	"    print(' '.join('{0}/{1}'.format(word, pos) for word, pos in sent))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# spaCy\n",
	"Industrial-strength Natural Language Processing (NLP) with Python and Cython\n",
	"\n",
	"Homepage: https://spacy.io\n",
	"\n",
	"GitHub: https://github.com/explosion/spaCy"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Install spaCy"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"import sys\n",
	"# Invoke pip through the kernel's own interpreter: a bare `pip3` on PATH may\n",
	"# belong to a different Python, leaving spaCy un-importable from this notebook.\n",
	"!\"{sys.executable}\" -m pip install -U spaCy"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Download language models"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import sys\n",
	"# Run the downloader with the kernel's own interpreter so it targets the same\n",
	"# spaCy installation that `import spacy` uses in this notebook.\n",
	"# NOTE(review): the `spacy.en.download` module is specific to older spaCy releases;\n",
	"# newer ones use `python -m spacy download <model>` -- confirm against the installed version.\n",
	"!\"{sys.executable}\" -m spacy.en.download all"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Using spaCy"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Loading the language data might take some time..."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 41,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import spacy\n",
	"nlp = spacy.load('en')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 49,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The/DET clock/NOUN is/VERB ticking/VERB for/ADP Spotify/PROPN \n"
	]
	}
	],
	"source": [
	"# Parse each line separately and print its tokens as token/POS pairs,\n",
	"# one output line per sentence found by spaCy.\n",
	"for t in sents:\n",
	"    doc = nlp(t)\n",
	"    for sent in doc.sents:\n",
	"        for token in sent:\n",
	"            print('{}/{}'.format(token, token.pos_), end=' ')\n",
	"        # end the output line once the sentence's tokens are printed\n",
	"        print()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 50,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The/DET clock/NOUN is/VERB ticking/VERB for/ADP Spotify/PROPN \n",
	"/SPACE \n"
	]
	}
	],
	"source": [
	"# Parse the whole article in one call and print its tokens as token/POS pairs,\n",
	"# one output line per sentence found by spaCy.\n",
	"doc = nlp(text)\n",
	"for sent in doc.sents:\n",
	"    for token in sent:\n",
	"        print('{}/{}'.format(token, token.pos_), end=' ')\n",
	"    # end the output line once the sentence's tokens are printed\n",
	"    print()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.0"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}
No results found