Sanjogsharma · June 22, 2015 22:47
diff --git a/hw_3 b/hw_3
 '''
 Pandas Homework with IMDB data
 '''
 import pandas as pd
 import matplotlib.pyplot as plt
 '''
 BASIC LEVEL
 '''

 # read in 'imdb_1000.csv' and store it in a DataFrame named movies
 movies = pd.read_csv('imdb_1000.csv')

 # check the number of rows and columns
 movies.shape
 # check the data type of each column
 movies.dtypes
 # calculate the average movie duration
 movies.duration.mean()
 # sort the DataFrame by duration to find the shortest and longest movies
 # (Series.order() no longer exists in pandas; sort_values() replaces it)
 movies.sort_values('duration')
 # 64 shortest; 242 longest
 # create a histogram of duration, choosing an "appropriate" number of bins
 # start a fresh figure so the plot does not land on axes left by an earlier plot
 plt.figure()
 movies.duration.plot(kind='hist', bins=30)
 # use a box plot to display that same data
 plt.figure()
 movies.duration.plot(kind='box')
 '''
 INTERMEDIATE LEVEL
 '''

 # count how many movies have each of the content ratings
 movies.content_rating.value_counts()

 # use a visualization to display that same data, including a title and x and y labels
 plt.figure()
 movies.content_rating.value_counts().plot(
     kind='bar', title='Number of movies by content rating')
 plt.xlabel('content rating')
 plt.ylabel('number of movies')

 # convert the following content ratings to "UNRATED": NOT RATED, APPROVED, PASSED, GP
 # The result is assigned back to the column: calling replace(inplace=True) on
 # movies.content_rating is a chained in-place update, which is not guaranteed
 # to reach the DataFrame (and does not under pandas copy-on-write).
 movies['content_rating'] = movies.content_rating.replace(
     ['NOT RATED', 'APPROVED', 'PASSED', 'GP'], 'UNRATED')

 # convert the following content ratings to "NC-17": X, TV-MA
 movies['content_rating'] = movies.content_rating.replace(
     ['X', 'TV-MA'], 'NC-17')

 # count the number of missing values in each column
 movies.isnull().sum()

 # if there are missing values: examine them, then fill them in with "reasonable" values
 movies[movies.isnull().any(axis=1)]
 # 'NOT RATED' was folded into 'UNRATED' above, so missing ratings get 'UNRATED'
 # too. Only content_rating is filled: a rating label is not a meaningful value
 # for any other column.
 movies['content_rating'] = movies.content_rating.fillna('UNRATED')

 # calculate the average star rating for movies 2 hours or longer,
 # and compare that with the average star rating for movies shorter than 2 hours
 # (>= so that movies of exactly 120 minutes count as "2 hours or longer")
 movies[movies.duration >= 120].star_rating.mean()
 movies[movies.duration < 120].star_rating.mean()

 # use a visualization to detect whether there is a relationship between star rating and duration
 movies.plot(kind='scatter', x='star_rating', y='duration', alpha=0.3)

 # calculate the average duration for each genre
 movies.groupby('genre').duration.mean()
 '''
 ADVANCED LEVEL
 '''

 # visualize the relationship between content rating and duration
 movies.boxplot(column='duration', by='content_rating')

 # determine the top rated movie (by star rating) for each genre
 # sort_values() replaces the removed DataFrame.sort(); the columns are selected
 # with a list because tuple-style selection on a groupby is no longer accepted.
 # After the descending sort, the first row of each genre is its top rated movie.
 movies.sort_values('star_rating', ascending=False).groupby('genre')[['title', 'star_rating']].first()

 # check if there are multiple movies with the same title, and if so, determine if they are actually duplicates
 movies.title.value_counts().head()
 # list every row whose title occurs more than once, grouped by title, so the
 # remaining columns can be compared to tell real duplicates from remakes
 movies[movies.title.duplicated(keep=False)].sort_values('title')

 # calculate the average star rating for each genre, but only include genres with at least 10 movies
 star_rating = movies.groupby('genre').star_rating.agg(['mean'])
 genre_counter = movies.groupby('genre').genre.agg(['count'])
 # both frames are indexed by genre, so the count mask lines up with the means
 star_rating[genre_counter['count'] >= 10]
	'''
	Pandas Homework with IMDB data
	'''
	import pandas as pd
	import matplotlib.pyplot as plt
	'''
	BASIC LEVEL
	'''

	# read in 'imdb_1000.csv' and store it in a DataFrame named movies
	movies = pd.read_csv('imdb_1000.csv')

	# check the number of rows and columns
	movies.shape
	# check the data type of each column
	movies.dtypes
	# calculate the average movie duration
	movies.duration.mean()
	# sort the DataFrame by duration to find the shortest and longest movies
	# (Series.order() no longer exists in pandas; sort_values() replaces it)
	movies.sort_values('duration')
	# 64 shortest; 242 longest
	# create a histogram of duration, choosing an "appropriate" number of bins
	# start a fresh figure so the plot does not land on axes left by an earlier plot
	plt.figure()
	movies.duration.plot(kind='hist', bins=30)
	# use a box plot to display that same data
	plt.figure()
	movies.duration.plot(kind='box')
	'''
	INTERMEDIATE LEVEL
	'''

	# count how many movies have each of the content ratings
	movies.content_rating.value_counts()

	# use a visualization to display that same data, including a title and x and y labels
	plt.figure()
	movies.content_rating.value_counts().plot(
	    kind='bar', title='Number of movies by content rating')
	plt.xlabel('content rating')
	plt.ylabel('number of movies')

	# convert the following content ratings to "UNRATED": NOT RATED, APPROVED, PASSED, GP
	# The result is assigned back to the column: calling replace(inplace=True) on
	# movies.content_rating is a chained in-place update, which is not guaranteed
	# to reach the DataFrame (and does not under pandas copy-on-write).
	movies['content_rating'] = movies.content_rating.replace(
	    ['NOT RATED', 'APPROVED', 'PASSED', 'GP'], 'UNRATED')

	# convert the following content ratings to "NC-17": X, TV-MA
	movies['content_rating'] = movies.content_rating.replace(
	    ['X', 'TV-MA'], 'NC-17')

	# count the number of missing values in each column
	movies.isnull().sum()

	# if there are missing values: examine them, then fill them in with "reasonable" values
	movies[movies.isnull().any(axis=1)]
	# 'NOT RATED' was folded into 'UNRATED' above, so missing ratings get 'UNRATED'
	# too. Only content_rating is filled: a rating label is not a meaningful value
	# for any other column.
	movies['content_rating'] = movies.content_rating.fillna('UNRATED')

	# calculate the average star rating for movies 2 hours or longer,
	# and compare that with the average star rating for movies shorter than 2 hours
	# (>= so that movies of exactly 120 minutes count as "2 hours or longer")
	movies[movies.duration >= 120].star_rating.mean()
	movies[movies.duration < 120].star_rating.mean()

	# use a visualization to detect whether there is a relationship between star rating and duration
	movies.plot(kind='scatter', x='star_rating', y='duration', alpha=0.3)

	# calculate the average duration for each genre
	movies.groupby('genre').duration.mean()
	'''
	ADVANCED LEVEL
	'''

	# visualize the relationship between content rating and duration
	movies.boxplot(column='duration', by='content_rating')

	# determine the top rated movie (by star rating) for each genre
	# sort_values() replaces the removed DataFrame.sort(); the columns are selected
	# with a list because tuple-style selection on a groupby is no longer accepted.
	# After the descending sort, the first row of each genre is its top rated movie.
	movies.sort_values('star_rating', ascending=False).groupby('genre')[['title', 'star_rating']].first()

	# check if there are multiple movies with the same title, and if so, determine if they are actually duplicates
	movies.title.value_counts().head()
	# list every row whose title occurs more than once, grouped by title, so the
	# remaining columns can be compared to tell real duplicates from remakes
	movies[movies.title.duplicated(keep=False)].sort_values('title')

	# calculate the average star rating for each genre, but only include genres with at least 10 movies
	star_rating = movies.groupby('genre').star_rating.agg(['mean'])
	genre_counter = movies.groupby('genre').genre.agg(['count'])
	# both frames are indexed by genre, so the count mask lines up with the means
	star_rating[genre_counter['count'] >= 10]
No results found