{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import pandas as pd\n",
"import spacy\n",
"import time\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"df = pd.read_csv('articles.csv')\n",
"original_columns = df.columns\n",
"df.drop('Unnamed: 0',axis=1 ,inplace=True)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"# Why use named entity recogniser?\n",
"\n",
"A pretrained NER uses a combination of regex, hand-written rules, dictionaries and context-aware statistical techniques to do part-of-speech tagging on each token.\n",
"See example below (taken from articles.csv) where the NER correctly extracts the author name which is identified by the 'PERSON' label: "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cherine Fahd: PERSON\n",
"Lecturer Photography: ORG\n",
"School of Design: ORG\n",
"University of Technology Sydney: ORG\n"
]
}
],
"source": [
"# Example of NER\n",
"NER_model = spacy.load(\"en_core_web_sm\")\n",
"author_input_string = 'Cherine Fahd, Senior Lecturer Photography, School of Design, University of Technology Sydney'\n",
"entities = NER_model(author_input_string)\n",
"for ent in entities.ents: \n",
" print(f'{ent.text}: {ent.label_}')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Maximum authors for 1 article=7\n",
"702.3755922317505\n"
]
}
],
"source": [
"def extract_people(entities):\n",
" \"\"\"Converts NER object into a list of PEOPLE tokens\"\"\"\n",
" return list(set([ent.text for ent in entities.ents if ent.label_ == 'PERSON' ]))\n",
" \n",
"start = time.time()\n",
"NER_results = df[df.author.notnull()].author.apply(NER_model) # Creates a column of NER objects\n",
"author_series = NER_results.apply(extract_people) # Creates a column with a list of author(s) for each row\n",
"max_authors = max([len(person_list) for person_list in author_series.values])\n",
"print(f'Maximum authors for 1 article={max_authors}')\n",
"columns = ['author'+str(i+1) for i in range(max_authors)]\n",
"extracted_authors_df = pd.DataFrame(author_series.to_list(), columns=columns, index=author_series.index) # Splits list of authors into individual columns\n",
"df = df.join(extracted_authors_df)\n",
"print(time.time()-start)"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"One downside of the NER model is that it is quite slow compared to just using regular expressions. To process 50,000 author strings might take around 12 minutes. We could speed this up by only applying the NER to some of the author_strings."
]
},
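{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of this speed-up, assuming a cheap substring test can identify the dailymail.co.uk strings (which are handled by regex later anyway), and using spaCy's nlp.pipe to batch the remaining strings. The filter condition and batch size are illustrative assumptions, not part of the original pipeline:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: skip author strings containing a dailymail.co.uk URL\n",
"# (the NER fails on these anyway) and batch the rest through nlp.pipe,\n",
"# which is much faster than calling the model one string at a time.\n",
"needs_ner = df.author.notnull() & ~df.author.str.contains('dailymail.co.uk', regex=False, na=False)\n",
"texts = df.loc[needs_ner, 'author'].tolist()\n",
"docs = list(NER_model.pipe(texts, batch_size=256))  # batched NER inference"
]
},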
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"# Inspecting a shuffled sample of the data to look for problems\n",
"df.sample(frac= 0.04).to_csv('NER_only_sample2.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"After inspecting the output of the NER model we see that in some cases 'For Mailonline' has been identified as part of the author name by the NER. the cell below uses regex to identify and remove this part of the string from the name."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cleaning column: author1\n",
"Cleaning column: author2\n",
"Cleaning column: author3\n",
"Cleaning column: author4\n",
"Cleaning column: author5\n",
"Cleaning column: author6\n",
"Cleaning column: author7\n"
]
}
],
"source": [
"# Removing mistakes on the NER model where it has captured ' For ...' \n",
"for col in [col for col in df.columns if col not in original_columns]:\n",
" print((f'Cleaning column: {col}'))\n",
" mask = df[df[col].notnull()][col].str.contains(' f|For ', regex=True)\n",
" for i in df[df[col].notnull()][mask].index:\n",
" df.loc[i, col] = re.split(' f|For ', df.loc[i, col])[0]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Associated+Press, By Associated Press\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Associated+Press, By Associated Press\n",
"BBC News\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Brittany+Valadez+For+Dailymail.com, By Brittany Valadez For Dailymail.com\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Afp, By Afp\n",
"BBC News\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Associated+Press, By Associated Press\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Dailymail.com+Reporter, By Dailymail.com Reporter\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Shivali+Best+For+Mailonline, By Shivali Best For Mailonline\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Press+Association, By Press Association\n",
"nan\n",
"nan\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Afp, By Afp\n",
"Unknown\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Associated+Press, By Associated Press\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Reuters, By Reuters\n",
"Telegraph Reporters\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Associated+Press, By Associated Press\n",
"Anoosh Chakelian\n",
"http://www.dailymail.co.uk/home/search.html?s=&authornamef=Press+Association, By Press Association\n",
"Seamus Duff\n"
]
}
],
"source": [
"# Viewing a sample of entries where no author name was extracted to look for patterns\n",
"for i in range(50):\n",
" if df['author1'].isnull()[i]:\n",
" print(df.loc[i, 'author'])"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"It's clear that the NER model fails to detect an author name where the website is dailymail.co.uk. We can use the text pattern 'By {author name} For...' to extract the name in these cases. Note that in some cases 'For...' is not included and in some cases there are multiple authors."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"# Capturing names with the 'by ... for' split \n",
"for i in df[df.author1.isnull()].index: # Only parse those results where we have not already extracted a name\n",
" auth_str = df.loc[i, 'author']\n",
" try:\n",
" len(auth_str)\n",
" except TypeError:\n",
" continue\n",
" if ' by 'in auth_str.lower():\n",
" auth_str = re.split(' b|By ', auth_str)[1]\n",
" auth_str = re.split(' f|For ', auth_str)[0]\n",
" name_list = re.split(', ', auth_str) # In the case of multiple authors split on commas\n",
" name_list.extend(re.split(' and ', name_list.pop())) # find and remove the word 'and' we can use pop because 'and' will always be before the last author's name\n",
" for j in range(len(name_list)):\n",
" df.loc[i, 'author'+ str(j+1)] = name_list[j] # go through the list of extracted authors and add them to respective columns "
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"The next transformation is optional - it depends on the use case for the data. If we want the data to have high recall when searching for authors, then we might choose to apply it. However, it is likely to also introduce a lot of non-author-name strings into the author columns. So if we just want a clean list of author names we would probably not choose to run it. A compromise would be to use additional regular expressions and dictionaries to further filter the results of this transformation."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"# If no author name has been extraced use whatever is left minus the URL (and split in case of multiple authors)\n",
"url_pattern = re.compile('https?://\\S*, ')\n",
"for i in df[df.author1.isnull()].index:\n",
" auth_str = df.loc[i, 'author']\n",
" try:\n",
" len(auth_str)\n",
" except TypeError:\n",
" continue\n",
"\n",
" text = ''.join(re.split(url_pattern, auth_str)[0]) # remove the URL portion if present\n",
" name_list = re.split(', ', text)\n",
" name_list.extend(re.split(' and ', name_list.pop()))\n",
" for j in range(len(name_list)):\n",
" df.loc[i, 'author'+ str(j+1)] = name_list[j]"
]
},
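{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of that compromise, assuming a small hand-written blocklist and a simple name-shape regex; the blocklist entries and the pattern are illustrative assumptions rather than a tested filter:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical post-filter: keep only strings that look like personal names\n",
"# and are not in a small blocklist of known non-name strings. Both the\n",
"# blocklist and the pattern are assumptions for illustration.\n",
"non_names = {'Reuters', 'Afp', 'Associated Press', 'Press Association', 'BBC News'}\n",
"name_shape = re.compile(r\"^[A-Z][a-z'-]+(?: [A-Z][a-z'-]+)+$\")  # e.g. 'Jane Smith'\n",
"\n",
"def looks_like_name(text):\n",
"    return isinstance(text, str) and text not in non_names and bool(name_shape.match(text))\n",
"\n",
"for col in ['author' + str(i) for i in range(1, max_authors + 1)]:\n",
"    df[col] = df[col].where(df[col].apply(looks_like_name))  # non-names become NaN"
]
},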
{
"cell_type": "markdown",
"metadata": {
"button": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"We may want to also remove initials and title's from the names for example: \n",
"\n",
"Dr. Mell Norman\n",
"Dawn R Bazely\n",
"Regina F. Graham\n",
"\n",
"This will help standardize the representation of names - thereby making them easier to search."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"prefix_title = re.compile('^\\w\\w\\. ') # Removes two letters followed by a period from beginning of string\n",
"middle_initial = re.compile(' \\w\\.? ') # Removes single letter surrounded by space with or without a period\n",
"\n",
"def normalise_names(name_text):\n",
" if type(name_text) != str:\n",
" return None\n",
" removed_prefix = re.sub(prefix_title, '', name_text)\n",
" removed_middle_initial = re.sub(middle_initial, ' ', removed_prefix)\n",
" return removed_middle_initial\n",
"\n",
"\n",
"for col in ['author'+ str(i) for i in range(1,8)]:\n",
" df[col] = df[col].apply(normalise_names)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
" type(df.loc[0,'author1']) == str"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"df.to_csv('cleaned_articles.csv')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"df.sample(frac=0.06).to_csv('cleaned_articles_random_sample.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"# Conclusion\n",
"\n",
"We have not been able to achieve 100% accuracy - because there is an inherrent trade off between high recall (not missing any author names where they are present) and high precision (all of the names extracted are genuine names rather than other words).\n",
"\n",
"If we really wanted to improve accuracy we could manually label a few cases where the Spacy NER model has made mistakes and retrain the model using this hand-labelled data. Other possible solutions would be writing dictionaries and regular expressions to catch common errors."
]
},
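{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the retraining idea, using the spaCy 2.x training-data format (character offsets of each entity in the raw string). The labelled examples below are taken from the samples above, but the single update pass is an illustrative assumption, not a tested training run:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: hand-labelled examples in the spaCy 2.x format,\n",
"# where entities are (start_char, end_char, label) offsets into the text.\n",
"TRAIN_DATA = [\n",
"    ('By Shivali Best For Mailonline', {'entities': [(3, 15, 'PERSON')]}),\n",
"    ('Telegraph Reporters', {'entities': []}),  # negative example: no person\n",
"]\n",
"\n",
"optimizer = NER_model.resume_training()  # continue from the pretrained weights\n",
"for text, annotations in TRAIN_DATA:\n",
"    NER_model.update([text], [annotations], sgd=optimizer)"
]
},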
{
"cell_type": "markdown",
"metadata": {
"button": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"# Extracting jsons"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"No actual json data was provided as part of the task specification so I have assumed that the API would return something like below. The existing csv file was split into 501 json files according to this schema."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"button": false,
"collapsed": true,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [],
"source": [
"# Possible structure for API response json files\n",
"{\n",
" 'list_articles':[\n",
" # article 1\n",
" {\n",
" 'author':'Name here',\n",
" 'content': 'This would be the text of the article',\n",
" 'description': 'short descr of article',\n",
" 'published_at': 'timestamp',\n",
" 'source':{\n",
" 'id': None,\n",
" 'name': 'website.co.uk'\n",
" }\n",
" 'title': 'Headline goes here',\n",
" 'url': 'website.co.uk/id',\n",
" 'urlToImage': 'website.co.uk/id/image'\n",
" },\n",
" \n",
" # article 2 \n",
" {\n",
" 'author':'Name here',\n",
" 'content': 'This would be the text of the article',\n",
" 'description': 'short descr of article',\n",
" 'published_at': 'timestamp',\n",
" 'source':{\n",
" 'id': None,\n",
" 'name': 'website.co.uk'\n",
" }\n",
" 'title': 'Headline goes here',\n",
" 'url': 'website.co.uk/id',\n",
" 'urlToImage': 'website.co.uk/id/image'\n",
" },\n",
" # etc. (approx 100 articles in each json file)\n",
"]\n",
"}"
]
},
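{
"cell_type": "markdown",
"metadata": {},
"source": [
"The splitting itself was not shown above; here is a hypothetical sketch of how the csv could be divided into ~100-article JSON files matching the schema. The chunk size and file-name pattern are assumptions, chosen to match the '001all_articles.json' name used below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only (an assumption about how the 501 files were produced):\n",
"# split the articles into chunks of ~100 rows and write each chunk using\n",
"# the {'list_articles': [...]} schema above.\n",
"os.makedirs('json', exist_ok=True)\n",
"article_cols = [c for c in original_columns if c in df.columns]  # original csv columns only\n",
"chunk_size = 100\n",
"for n, start_row in enumerate(range(0, len(df), chunk_size), start=1):\n",
"    chunk = df.iloc[start_row:start_row + chunk_size][article_cols]\n",
"    payload = {'list_articles': chunk.to_dict(orient='records')}\n",
"    with open(f'json/{n:03d}all_articles.json', 'w') as f:\n",
"        json.dump(payload, f)"
]
},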
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"button": false,
"collapsed": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total time (seconds) = 7.7462170124053955\n",
"Len new_df: 49198\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>author</th>\n",
" <th>content</th>\n",
" <th>description</th>\n",
" <th>publishedAt</th>\n",
" <th>source</th>\n",
" <th>title</th>\n",
" <th>url</th>\n",
" <th>urlToImage</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>NaN</td>\n",
" <td>'Let's do it!' Karlie Kloss arrives for 2017 V...</td>\n",
" <td>'Let's do it!' Karlie Kloss arrives for 2017 V...</td>\n",
" <td>2017-11-20T04:57:22Z</td>\n",
" <td>{'id': None, 'name': 'Dailymail.co.uk'}</td>\n",
" <td>'Let's do it!' Karlie Kloss at Victoria's Secr...</td>\n",
" <td>http://www.dailymail.co.uk/video/tvshowbiz/vid...</td>\n",
" <td>http://i.dailymail.co.uk/i/pix/2017/11/20/05/4...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>They're back! Julia Morris and Chris Brown ret...</td>\n",
" <td>2017-11-20T04:57:10Z</td>\n",
" <td>{'id': None, 'name': 'Dailymail.co.uk'}</td>\n",
" <td>They're back! Julia Morris and Chris Brown ret...</td>\n",
" <td>http://www.dailymail.co.uk/video/tvshowbiz/vid...</td>\n",
" <td>http://i.dailymail.co.uk/i/pix/2017/11/20/04/4...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>http://www.dailymail.co.uk/home/search.html?s=...</td>\n",
" <td>The 93-year-old president is facing a barrage ...</td>\n",
" <td>Robert Mugabe faced the threat of impeachment ...</td>\n",
" <td>2017-11-20T04:56:14Z</td>\n",
" <td>{'id': None, 'name': 'Dailymail.co.uk'}</td>\n",
" <td>Noon deadline looms for defiant Mugabe as Zimb...</td>\n",
" <td>http://www.dailymail.co.uk/wires/afp/article-5...</td>\n",
" <td>http://i.dailymail.co.uk/1/2017/11/20/04/wire-...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>Artists Kelly Clarkson &amp; P!nk perform 'everybo...</td>\n",
" <td>2017-11-20T04:56:04Z</td>\n",
" <td>{'id': None, 'name': 'Dailymail.co.uk'}</td>\n",
" <td>Kelly Clarkson &amp; Pink sing 'everybody hurts' a...</td>\n",
" <td>http://www.dailymail.co.uk/video/tvshowbiz/vid...</td>\n",
" <td>http://i.dailymail.co.uk/i/pix/2017/11/20/05/4...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>A freshly blonde Selena Gomez performs her new...</td>\n",
" <td>2017-11-20T04:55:59Z</td>\n",
" <td>{'id': None, 'name': 'Dailymail.co.uk'}</td>\n",
" <td>A blonde Selena Gomez performs 'Wolves' at the...</td>\n",
" <td>http://www.dailymail.co.uk/video/tvshowbiz/vid...</td>\n",
" <td>http://i.dailymail.co.uk/i/pix/2017/11/20/05/4...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" author \\\n",
"0 NaN \n",
"1 NaN \n",
"2 http://www.dailymail.co.uk/home/search.html?s=... \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" content \\\n",
"0 'Let's do it!' Karlie Kloss arrives for 2017 V... \n",
"1 NaN \n",
"2 The 93-year-old president is facing a barrage ... \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" description publishedAt \\\n",
"0 'Let's do it!' Karlie Kloss arrives for 2017 V... 2017-11-20T04:57:22Z \n",
"1 They're back! Julia Morris and Chris Brown ret... 2017-11-20T04:57:10Z \n",
"2 Robert Mugabe faced the threat of impeachment ... 2017-11-20T04:56:14Z \n",
"3 Artists Kelly Clarkson & P!nk perform 'everybo... 2017-11-20T04:56:04Z \n",
"4 A freshly blonde Selena Gomez performs her new... 2017-11-20T04:55:59Z \n",
"\n",
" source \\\n",
"0 {'id': None, 'name': 'Dailymail.co.uk'} \n",
"1 {'id': None, 'name': 'Dailymail.co.uk'} \n",
"2 {'id': None, 'name': 'Dailymail.co.uk'} \n",
"3 {'id': None, 'name': 'Dailymail.co.uk'} \n",
"4 {'id': None, 'name': 'Dailymail.co.uk'} \n",
"\n",
" title \\\n",
"0 'Let's do it!' Karlie Kloss at Victoria's Secr... \n",
"1 They're back! Julia Morris and Chris Brown ret... \n",
"2 Noon deadline looms for defiant Mugabe as Zimb... \n",
"3 Kelly Clarkson & Pink sing 'everybody hurts' a... \n",
"4 A blonde Selena Gomez performs 'Wolves' at the... \n",
"\n",
" url \\\n",
"0 http://www.dailymail.co.uk/video/tvshowbiz/vid... \n",
"1 http://www.dailymail.co.uk/video/tvshowbiz/vid... \n",
"2 http://www.dailymail.co.uk/wires/afp/article-5... \n",
"3 http://www.dailymail.co.uk/video/tvshowbiz/vid... \n",
"4 http://www.dailymail.co.uk/video/tvshowbiz/vid... \n",
"\n",
" urlToImage \n",
"0 http://i.dailymail.co.uk/i/pix/2017/11/20/05/4... \n",
"1 http://i.dailymail.co.uk/i/pix/2017/11/20/04/4... \n",
"2 http://i.dailymail.co.uk/1/2017/11/20/04/wire-... \n",
"3 http://i.dailymail.co.uk/i/pix/2017/11/20/05/4... \n",
"4 http://i.dailymail.co.uk/i/pix/2017/11/20/05/4... "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Parse all json files into a single dataframe\n",
"start = time.time()\n",
"with open('json/001all_articles.json') as json_file:\n",
" json_data = json.load(json_file)\n",
" columns = json_data['list_articles'][0].keys()\n",
" new_df = pd.DataFrame(columns=columns)\n",
"\n",
"for filepath in os.listdir('json'):\n",
" if '.json'not in filepath:\n",
" continue\n",
" with open('json/' + filepath) as json_file:\n",
" json_data = json.load(json_file)\n",
" new_df = new_df.append(json_data['list_articles'], ignore_index=True)\n",
"\n",
"print(f\"Total time (seconds) = {time.time() - start}\")\n",
"print(f'Len new_df: {len(new_df)}')\n",
"new_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"button": false,
"deletable": true,
"new_sheet": false,
"run_control": {
"read_only": false
}
},
"source": [
"This approach parsed all the jsons in around 10 seconds"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}