-
-
Save Sanjogsharma/e1161f02d8e8f0a6ef0c to your computer and use it in GitHub Desktop.
Hw_3_IMDB_DAATA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ''' | |
| Pandas Homework with IMDB data | |
| ''' | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
'''
BASIC LEVEL
'''

# read in 'imdb_1000.csv' and store it in a DataFrame named movies
# (read_csv uses a comma separator by default, so no explicit sep is needed)
movies = pd.read_csv('imdb_1000.csv')

# check the number of rows and columns
movies.shape

# check the data type of each column
movies.dtypes

# calculate the average movie duration
movies.duration.mean()

# sort the DataFrame by duration to find the shortest and longest movies
# (Series.order() no longer exists in pandas; sort_values() replaces it, and
# sorting the whole DataFrame keeps the titles next to their durations)
movies.sort_values('duration')
# 64 shortest; 242 longest

# create a histogram of duration, choosing an "appropriate" number of bins
# (a Series plot draws on the current axes if a figure is already open, so
# start a new figure for each plot to keep them separate when run as a script)
plt.figure()
movies.duration.plot(kind='hist', bins=30)

# use a box plot to display that same data
plt.figure()
movies.duration.plot(kind='box')
'''
INTERMEDIATE LEVEL
'''

# count how many movies have each of the content ratings
movies.content_rating.value_counts()

# use a visualization to display that same data, including a title and x and y labels
# (the x axis holds the content ratings, the y axis the number of movies)
plt.figure()
movies.groupby('content_rating').title.count().plot(
    kind='bar', title='Number of movies by content rating')
plt.xlabel('content rating')
plt.ylabel('number of movies')

# convert the following content ratings to "UNRATED": NOT RATED, APPROVED, PASSED, GP
# (assign the result back: an inplace replace on the attribute-accessed column
# is chained assignment and is not guaranteed to update the DataFrame)
movies['content_rating'] = movies['content_rating'].replace(
    ['NOT RATED', 'APPROVED', 'PASSED', 'GP'], 'UNRATED')

# convert the following content ratings to "NC-17": X, TV-MA
movies['content_rating'] = movies['content_rating'].replace(
    ['X', 'TV-MA'], 'NC-17')

# count the number of missing values in each column
movies.isnull().sum()

# if there are missing values: examine them, then fill them in with "reasonable" values
movies[movies.content_rating.isnull()]
# fill only the content_rating column, and use 'UNRATED' so the filled rows
# match the label that 'NOT RATED' was converted to above; a frame-wide
# fillna would also write this string into any other column with gaps
movies['content_rating'] = movies['content_rating'].fillna('UNRATED')

# calculate the average star rating for movies 2 hours or longer,
# and compare that with the average star rating for movies shorter than 2 hours
# (>= so that movies lasting exactly 120 minutes count as "2 hours or longer")
movies[movies.duration >= 120].star_rating.mean()
movies[movies.duration < 120].star_rating.mean()

# use a visualization to detect whether there is a relationship between star rating and duration
movies.plot(kind='scatter', x='star_rating', y='duration', alpha=0.3)

# calculate the average duration for each genre
movies.groupby('genre').duration.mean()
'''
ADVANCED LEVEL
'''

# visualize the relationship between content rating and duration
movies.boxplot(column='duration', by='content_rating')

# determine the top rated movie (by star rating) for each genre
# (DataFrame.sort() no longer exists; use sort_values(). Columns are selected
# from the groupby with a list, since tuple selection is no longer supported.)
# After sorting by rating descending, the first row of each genre is its top movie.
movies.sort_values('star_rating', ascending=False).groupby('genre')[['title', 'star_rating']].first()

# check if there are multiple movies with the same title, and if so, determine if they are actually duplicates
movies.title.value_counts().head()
# show every row whose title appears more than once, side by side, so the
# other columns can be compared
movies[movies.title.duplicated(keep=False)].sort_values('title')
# rows that are identical in every column are true duplicates
movies.duplicated().sum()

# calculate the average star rating for each genre, but only include genres with at least 10 movies
star_rating = movies.groupby('genre').star_rating.agg(['mean'])
genre_counter = movies.groupby('genre').genre.agg(['count'])
# both frames are indexed by genre, so the count mask aligns with the means
star_rating[genre_counter['count'] >= 10]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment