Skip to content

Instantly share code, notes, and snippets.

@Sanjogsharma
Created June 22, 2015 22:47
Show Gist options
  • Select an option

  • Save Sanjogsharma/e1161f02d8e8f0a6ef0c to your computer and use it in GitHub Desktop.

Select an option

Save Sanjogsharma/e1161f02d8e8f0a6ef0c to your computer and use it in GitHub Desktop.
Hw_3_IMDB_DAATA
'''
Pandas Homework with IMDB data
'''
import pandas as pd
import matplotlib.pyplot as plt
'''
BASIC LEVEL
'''
# NOTE: bare expressions below only display a value when the script is run
# line-by-line in an interactive session (IPython, Spyder, a notebook).

# read in 'imdb_1000.csv' and store it in a DataFrame named movies
# (read_csv parses comma-separated files by default, so no explicit sep is needed)
movies = pd.read_csv('imdb_1000.csv')
# check the number of rows and columns
movies.shape
# check the data type of each column
movies.dtypes
# calculate the average movie duration
movies.duration.mean()
# sort the DataFrame by duration to find the shortest and longest movies
# (Series.order() was removed from pandas; sort_values() is its replacement)
movies.duration.sort_values()
# 64 shortest; 242 longest
# create a histogram of duration, choosing an "appropriate" number of bins
movies.duration.plot(kind='hist', bins=30)
# use a box plot to display that same data
# (Series.plot draws on the current axes when a figure is already open, so a
# new figure is started to keep the box plot from landing on the histogram)
plt.figure()
movies.duration.plot(kind='box')
'''
INTERMEDIATE LEVEL
'''
# count how many movies have each of the content ratings
movies.content_rating.value_counts()
# use a visualization to display that same data, including a title and x and y labels
# (a new figure keeps the bars off whatever axes an earlier Series plot left open)
plt.figure()
movies.groupby('content_rating')['title'].count().plot(
    kind='bar', title='Number of movies by content rating')
# the x axis holds the rating categories, the y axis the movie counts
plt.xlabel('content rating')
plt.ylabel('number of movies')
# convert the following content ratings to "UNRATED": NOT RATED, APPROVED, PASSED, GP
# (assign the result back to the column: an inplace replace on the attribute
# accessor acts on a temporary Series under pandas copy-on-write and would
# leave the DataFrame unchanged)
movies['content_rating'] = movies['content_rating'].replace(
    ['NOT RATED', 'APPROVED', 'PASSED', 'GP'], 'UNRATED')
# convert the following content ratings to "NC-17": X, TV-MA
movies['content_rating'] = movies['content_rating'].replace(
    ['X', 'TV-MA'], 'NC-17')
# count the number of missing values in each column
# (one call reports every column, so per-column checks are not needed)
movies.isnull().sum()
# if there are missing values: examine them, then fill them in with "reasonable" values
movies[movies.isnull().any(axis=1)]
# Only content_rating is filled: the placeholder is a rating label, so writing
# it into any other column would corrupt that column. 'UNRATED' is used so the
# filled rows match the category the NOT RATED movies were just mapped to.
movies['content_rating'] = movies['content_rating'].fillna('UNRATED')
# calculate the average star rating for movies 2 hours or longer,
# (>= so that movies lasting exactly 120 minutes are counted as "2 hours or longer")
movies[movies.duration >= 120].star_rating.mean()
# and compare that with the average star rating for movies shorter than 2 hours
movies[movies.duration < 120].star_rating.mean()
# use a visualization to detect whether there is a relationship between star rating and duration
movies.plot(kind='scatter', x='star_rating', y='duration', alpha=0.3)
# calculate the average duration for each genre
movies.groupby('genre').duration.mean()
'''
ADVANCED LEVEL
'''
# visualize the relationship between content rating and duration
movies.boxplot(column='duration', by='content_rating')
# determine the top rated movie (by star rating) for each genre -
# sort best-first, then take the first row of each genre group
# (DataFrame.sort() was removed from pandas in favour of sort_values(), and
# selecting several groupby columns requires a list, not a bare tuple)
movies.sort_values('star_rating', ascending=False).groupby('genre')[['title', 'star_rating']].first()
# check if there are multiple movies with the same title, and if so, determine if they are actually duplicates
movies.title.value_counts().head()
# show every row whose title occurs more than once, side by side, so the
# remaining columns can be compared to tell remakes from true duplicates
movies[movies.title.duplicated(keep=False)].sort_values('title')
# calculate the average star rating for each genre, but only include genres with at least 10 movies
star_rating = movies.groupby('genre').star_rating.agg(['mean'])
genre_counter = movies.groupby('genre').genre.agg(['count'])
# both frames are indexed by genre, so the count mask lines up with the means
popular_genre_ratings = star_rating.loc[genre_counter['count'] >= 10, 'mean']
popular_genre_ratings
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment