-
-
Save jiahao87/c97214065f996b76ab8fe4ca1964b2b5 to your computer and use it in GitHub Desktop.
| import pandas as pd | |
| import numpy as np | |
| import matplotlib | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import missingno | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
| %matplotlib inline | |
def time_series_plot(df):
    """Plot each numeric column as a time series at daily, monthly and yearly frequency.

    For every ``datetime64`` column in ``df`` and each of the frequencies
    'D', 'M' and 'Y', every numeric column is summed per period and drawn as
    a line chart with a zero-based y-axis and thousands-separated tick
    labels. Nothing is plotted when ``df`` has no ``datetime64`` column.

    Args:
        df: pandas DataFrame to plot.

    Returns:
        None. Each chart is shown with ``plt.show()``.
    """
    print("\nTo check time series of numeric data by daily, monthly and yearly frequency")
    datetime_cols = df.select_dtypes(include='datetime64').columns
    if len(datetime_cols) == 0:
        return
    numeric_cols = df.select_dtypes(include=np.number).columns
    # NOTE: pandas 2.2 deprecates the 'M'/'Y' aliases in favour of 'ME'/'YE';
    # the original aliases are kept so older pandas versions keep working.
    period_labels = {'D': 'daily', 'M': 'monthly', 'Y': 'yearly'}
    for col in datetime_cols:
        for freq, label in period_labels.items():
            print(f"Plotting {label} data")
            for col_num in numeric_cols:
                # Resample only the column being plotted: summing the whole
                # frame fails on non-summable columns (category, other
                # datetime columns) and wastes work on columns never drawn.
                resampled = df.set_index(col)[[col_num]].resample(freq).sum()
                ax = resampled.plot()
                ax.set_ylim(bottom=0)
                # A fresh formatter per axes; tick values are shown as
                # integers with thousands separators.
                ax.get_yaxis().set_major_formatter(
                    matplotlib.ticker.FuncFormatter(
                        lambda value, _pos: format(int(value), ',')))
                plt.show()
def numeric_eda(df, hue=None):
    """Generate exploratory summaries and plots for the numeric columns of ``df``.

    Shows ``df.describe()`` transposed, one box plot per numeric column,
    a violin plot for every (category column, numeric column) pair, and a
    pairwise joint-distribution plot of the numeric columns.

    Args:
        df: pandas DataFrame to analyse.
        hue: optional column name used to colour the pair plot.

    Returns:
        None. Output is printed and charts are shown with ``plt.show()``.
    """
    print("\nTo check: \nDistribution of numeric data")
    # ``display`` only exists inside IPython/Jupyter; fall back to ``print``
    # so the function also works when run as a plain script.
    try:
        render = display
    except NameError:
        render = print
    render(df.describe().T)

    numeric_cols = df.select_dtypes(include=np.number).columns
    if len(numeric_cols) == 0:
        # Without this guard ``add_subplot(1, 0, 1)`` raises ValueError.
        print("\nNo numeric columns found")
        return

    figure = plt.figure(figsize=(20, 10))
    for position, col in enumerate(numeric_cols, start=1):
        ax = figure.add_subplot(1, len(numeric_cols), position)
        sns.boxplot(y=col, data=df, boxprops={'facecolor': 'None'}, ax=ax)
    figure.tight_layout()
    plt.show()

    category_cols = df.select_dtypes(include='category').columns
    for col_num in numeric_cols:
        for col in category_cols:
            fig = sns.catplot(x=col, y=col_num, kind='violin', data=df, height=5, aspect=2)
            fig.set_xticklabels(rotation=90)
            plt.show()

    # Plot the pairwise joint distributions
    print("\nTo check pairwise joint distribution of numeric data")
    if hue is None:
        sns.pairplot(df[numeric_cols])
    else:
        # Add ``hue`` only when it is not already numeric; joining it
        # unconditionally raises on the overlapping column name.
        pair_cols = list(numeric_cols)
        if hue not in pair_cols:
            pair_cols.append(hue)
        sns.pairplot(df[pair_cols], hue=hue)
    plt.show()
def top5(df):
    """Print the five most frequent values of every object/category column.

    For each non-numeric column a two-column table is printed: the column's
    values under the column's own name and their frequencies under "Count".
    Columns with fewer than five distinct values print all of them.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Output is printed.
    """
    columns = df.select_dtypes(include=['object', 'category']).columns
    for col in columns:
        # f-string so non-string column labels do not break the message.
        print(f"Top 5 unique values of {col}")
        # Name the index and the counts explicitly: the labels produced by
        # value_counts().reset_index() changed in pandas 2.0, so renaming
        # "index"/<col> afterwards mislabels the table on newer versions.
        counts = df[col].value_counts().head(5)
        print(counts.rename_axis(col).reset_index(name="Count"))
        print(" ")
def categorical_eda(df, hue=None):
    """Summarise the non-numeric columns of a dataframe.

    Prints the distinct-value count of every object/category column, calls
    ``top5`` on the dataframe, then draws one count plot per category-dtype
    column, passing ``hue`` through to the plot.
    """
    print("\nTo check: \nUnique count of non-numeric data\n")
    non_numeric = df.select_dtypes(include=['object', 'category'])
    print(non_numeric.nunique())
    top5(df)

    # Count plots are drawn for category-dtype columns only
    category_names = df.select_dtypes(include='category').columns
    for name in category_names:
        grid = sns.catplot(x=name, kind="count", data=df, hue=hue)
        grid.set_xticklabels(rotation=90)
        plt.show()
def _show(obj):
    """Render ``obj`` with the notebook ``display`` hook, or ``print`` if it is unavailable."""
    try:
        render = display
    except NameError:
        # Not running under IPython/Jupyter.
        render = print
    render(obj)


def eda(df):
    """Run the full exploratory data analysis on a pandas DataFrame.

    Steps: preview the data, print ``df.info()``, preview and chart rows
    containing nulls, report duplicated rows, then run the categorical,
    numeric and time-series analyses.

    Args:
        df: pandas DataFrame (or subclass) to analyse. Cells that are empty
            or contain only whitespace are treated as NaN; the caller's
            object is not modified.

    Returns:
        None. Output is printed and charts are shown.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses.
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Only pandas dataframe is allowed as input")

    # replace field that's entirely space (or empty) with NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    print("Preview of data:")
    _show(df.head(3))

    print("\nTo check: \n (1) Total number of entries \n (2) Column types \n (3) Any null values\n")
    # info() prints its report itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    df.info()

    # generate preview of entries with null values
    null_mask = df.isnull()
    if null_mask.any(axis=None):
        print("\nPreview of data with null values:")
        _show(df[null_mask.any(axis=1)].head(3))
        missingno.matrix(df)
        plt.show()

    # generate count statistics of duplicate entries
    duplicate_count = int(df.duplicated().sum())
    if duplicate_count > 0:
        print("\n***Number of duplicated entries: ", duplicate_count)
        _show(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())
    else:
        print("\nNo duplicated entries found")

    # EDA of categorical data
    categorical_eda(df)

    # EDA of numeric data
    numeric_eda(df)

    # Plot time series plot of numeric data
    time_series_plot(df)
Thank you for this posting. However, I am having difficulty running it as-is. This line throws a syntax error:
%matplotlib inline ^ SyntaxError: invalid syntax
Also, the display function used throughout is undefined and throws an error as well:
display(df.head(3)) NameError: name 'display' is not defined
I am using Python 3.8, and all the imported modules are pip-installed.
Hi @Darveesh, I guess you are probably running the code as a script in a terminal. Please run the code in a notebook. Thank you.
Thank you for this posting. I am having difficulty running it as-is however. This line throws a syntax error:
%matplotlib inline ^ SyntaxError: invalid syntax
Also, the display function used throughout is undefined and throws an error as well:
display(df.head(3)) NameError: name 'display' is not defined
I am using Python 3.8, and all the imported modules are pip-installed.
Hi @Darveesh, I guess you are probably running the code as a script in a terminal. Please run the code in a notebook. Thank you.
You are right. Thank you for the clarification.
Thank you for this posting. However, I am having difficulty running it as-is. This line throws a syntax error:
%matplotlib inline ^ SyntaxError: invalid syntax
Also, the display function used throughout is undefined and throws an error as well:
display(df.head(3)) NameError: name 'display' is not defined
I am using Python 3.8, and all the imported modules are pip-installed.