1vth1nk3r · December 10, 2021 08:28
diff --git a/dtc: Improving Data Visualizations 0 b/dtc: Improving Data Visualizations 0
 # Two side-by-side panels drawn from `markets_by_state`, both coloured by the
 # `is_selected` column: a horizontal bar chart on the left axes and a
 # log_pop / log_markets scatter on the right axes.
 f, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 15))

 # x and y are named explicitly: seaborn >= 0.12 accepts only `data` as a
 # positional argument, so the old positional form raises TypeError there.
 sns.barplot(x = 'people_per_market', y = 'state', hue = 'is_selected',
             dodge = False, data = markets_by_state, ax = ax1)
 sns.scatterplot(x = 'log_pop', y = 'log_markets', hue = 'is_selected',
                 data = markets_by_state, ax = ax2, s = 100)
 # `hue` adds a legend to each axes; remove both.
 ax1.legend_.remove()
 ax2.legend_.remove()


 # Plot 'prop selling' against 'good' for every row of `goods_by_state` as
 # faint horizontal tick marks, then overlay one line per highlighted state
 # and label each line with the state name.
 sns.set_style('whitegrid')
 plt.scatter('good','prop selling', marker = '_', alpha = 0.7, data = goods_by_state)
 highlighted = goods_by_state.query("state in ['New Mexico','North Dakota','Vermont']")
 # x, y and hue are named explicitly: seaborn >= 0.12 accepts only `data`
 # positionally, so the old positional form raises TypeError there.
 sns.lineplot(x = 'good', y = 'prop selling', hue = 'state',
              data = highlighted, legend = False)
 # NOTE(review): despite the name, this keeps the FIRST row of each state
 # group; that row supplies the anchor point for the state's label.
 last_rows = highlighted.groupby('state', as_index = False).agg('first')
 for _,row in last_rows.iterrows():
     plt.annotate(row['state'], (row['good'], row['prop selling']),
                  ha = 'right', xytext = (5,0), textcoords = 'offset pixels')
 # Remove all four spines (top/right by default, plus bottom and left).
 sns.despine(bottom = True, left = True)


 # Heatmap of `markets_by_month`, with rows reordered to follow
 # `state_by_lat`, drawn in a light-to-steelblue sequential colormap.
 sns.set(font_scale = 0.85)
 blue_pal = sns.light_palette("steelblue", as_cmap = True)
 g = sns.heatmap(
     markets_by_month.reindex(state_by_lat),
     yticklabels = True,   # label every row instead of letting seaborn thin them
     cbar = False,
     cmap = blue_pal,
     linewidths = 0.1,
 )
 # Re-apply the existing tick labels with rotation 0.
 g.set_yticklabels(g.get_yticklabels(), rotation = 0)
 plt.title('Distribution of months open for farmers markets by latitude')
 plt.show()



 # Draw onto the existing `ax1` / `ax2` axes (created outside this snippet):
 # per-state bars on the left, a log-log population / market-count scatter
 # with one annotated point on the right.
 # x and y are named explicitly: seaborn >= 0.12 accepts only `data` as a
 # positional argument, so the old positional form raises TypeError there.
 sns.barplot(x = 'people_per_market', y = 'state', palette = state_colors,
             data = markets_by_state, ax = ax1)
 p = sns.scatterplot(x = 'population', y = 'num_markets', color = state_colors,
                     data = markets_by_state, s = 60, ax = ax2)
 ax2.set(xscale = "log", yscale = 'log')
 # Arrow points at xy; the text `tx_message` sits directly above it at xytext.
 ax2.annotate(tx_message, xy = (26956958,230),
              xytext = (26956958, 450),ha = 'right',
              size = 15, backgroundcolor = 'white',
              arrowprops = {'facecolor':'black', 'width': 3})
 sns.set_style('whitegrid')
diff --git a/dtc: Improving Data Visualizations 1 b/dtc: Improving Data Visualizations 1
 # Scatter Houston's NO2 vs SO2 with the observation(s) from day 330 of 2014
 # drawn in 'orangered' and every other point in 'lightgray'.
 houston_pollution = pollution.loc[pollution.city  ==  'Houston']
 houston_colors = [
     'orangered' if (day  ==  330) & (year  ==  2014) else 'lightgray'
     for day, year in zip(houston_pollution.day, houston_pollution.year)
 ]
 sns.regplot(
     data = houston_pollution, x = 'NO2', y = 'SO2',
     fit_reg = False,   # scatter only, no regression line
     scatter_kws = dict(facecolors = houston_colors, alpha = 0.7),
 )


 # Same scatter, but the highlight lives in a column: rows whose O3 equals the
 # Houston maximum are tagged 'Highest O3 Day' and `hue` colours by that tag.
 houston_pollution = pollution.loc[pollution.city  ==  'Houston'].copy()
 max_O3 = houston_pollution.O3.max()
 houston_pollution['point_type'] = [
     'Highest O3 Day' if reading  ==  max_O3 else 'Others'
     for reading in houston_pollution.O3
 ]
 sns.scatterplot(data = houston_pollution, x = 'NO2', y = 'SO2', hue = 'point_type')


 # Filled density curves of O3: year 2012 versus every other year.
 # NOTE(review): recent seaborn releases warn that `shade` and `distplot` are
 # deprecated; both are kept unchanged here.
 in_2012 = pollution.year == 2012
 not_2012 = pollution.year != 2012
 sns.kdeplot(pollution.loc[in_2012, 'O3'], shade = True, label = '2012')
 sns.kdeplot(pollution.loc[not_2012, 'O3'], shade = True, label = 'other years')


 # Density curve (histogram off) plus rug marks for one city's O3 readings.
 vandenberg_O3 = pollution.loc[pollution.city == 'Vandenberg Air Force Base', 'O3']
 sns.distplot(vandenberg_O3, label = 'Vandenberg',
              rug = True, hist = False, color = 'steelblue')


 # Swarm plot of O3 by city, restricted to rows where month == 3.
 pollution_mar = pollution.loc[pollution.month == 3]
 sns.swarmplot(data = pollution_mar, x = 'O3', y = "city", size = 3)


diff --git a/dtc: Improving Data Visualizations 2 b/dtc: Improving Data Visualizations 2
 # Left-aligned caption placed at data coordinates (0.57, 41).
 caption_font = dict(ha = 'left', size = 'large')
 plt.text(0.57, 41,
          'Cincinnati had highest observed\nSO2 value on Aug 11, 2013',
          fontdict = caption_font)


 # Arrow from the label at (2, 15) to the CO / NO2 point held in `lb_newyears`.
 arrow_style = dict(facecolor = 'gray', width = 3, shrink = 0.03)
 plt.annotate('Long Beach New Years',
              xy = (lb_newyears['CO'], lb_newyears['NO2']),
              xytext = (2, 15),
              arrowprops = arrow_style,
              backgroundcolor = 'white')


 # One face colour per row: Long Beach in 'orangered', the rest 'lightgray'.
 is_lb = ['lightgray' if city  !=  'Long Beach' else 'orangered'
          for city in pollution['city']]
 sns.regplot(data = pollution, x = 'CO', y = 'O3', fit_reg = False,
             scatter_kws = dict(facecolors = is_lb, alpha = 0.3))
diff --git a/dtc: Improving Data Visualizations 3 b/dtc: Improving Data Visualizations 3
 # One CO-vs-NO2 scatter per city, wrapped three panels per row.
 g = sns.FacetGrid(data = pollution, col = 'city',  col_wrap = 3)
 g.map(sns.scatterplot, 'CO', 'NO2', alpha = 0.2)

 # The same mean-CO-per-city bar chart drawn twice: once with black bar
 # edges, once in a single flat colour.
 mean_co_bars = dict(y = 'city', x = 'CO', estimator = np.mean, ci = False, data = pollution)
 sns.barplot(edgecolor = 'black', **mean_co_bars)
 sns.barplot(color = 'cadetblue', **mean_co_bars)

 # Continuous hue (O3) mapped through a light-to-orangered colormap.
 color_palette = sns.light_palette('orangered', as_cmap = True)
 sns.scatterplot(data = cinci_2014, x = 'CO', y = 'NO2',
                 hue = 'O3', palette = color_palette)

 # Diverging colormap centred on 0 with the colour range pinned to [-4, 4].
 color_palette = sns.diverging_palette(250, 0, as_cmap = True)
 sns.heatmap(nov_2015_CO, cmap = color_palette,
             vmin = -4, vmax = 4, center = 0)

 # Dark-background variant: the diverging palette's midpoint is set to dark.
 plt.style.use("dark_background")
 color_palette = sns.diverging_palette(250, 0, center = 'dark', as_cmap = True)
 sns.heatmap(oct_2015_o3, center = 0, cmap = color_palette)


 # One thick CO line per city, coloured with the qualitative 'Set2' palette.
 sns.lineplot(data = pollution_jan13, x = "day", y = "CO",
              hue = "city", palette = "Set2", linewidth = 3)


 # Keep distinct colours only for three city/pollutant combinations; every
 # other combination is relabelled 'other' so it shares one colour.
 wanted_combos = ['Vandenberg Air Force Base NO2', 'Long Beach CO', 'Cincinnati SO2']
 city_pol_month['color_cats'] = [
     'other' if combo not in wanted_combos else combo
     for combo in city_pol_month['city_pol']
 ]
 # units + estimator=None: draw one un-aggregated line per 'city_pol' value.
 sns.lineplot(data = city_pol_month, x = "month", y = "value",
              hue = 'color_cats', palette = 'Set2',
              units = 'city_pol', estimator = None)


 # Bin CO into four quantile buckets (integer codes, since labels=False) and
 # use the bucket as an ordinal hue for the Des Moines rows.
 pollution['CO quartile'] = pd.qcut(pollution['CO'], q = 4, labels = False)
 des_moines = pollution.query("city  ==  'Des Moines'")
 sns.scatterplot(data = des_moines, x = 'SO2', y = 'NO2',
                 hue = 'CO quartile', palette = 'GnBu')


 # One row of bars per pollutant, each with its own y-axis scale.
 sns.catplot(data = city_maxes, kind = 'bar',
             x = 'city', y = 'value', hue = 'year', row = 'pollutant',
             palette = 'BuGn', sharey = False)

diff --git a/dtc: Improving Data Visualizations 4 b/dtc: Improving Data Visualizations 4
 # Add mean +/- 1.96 * std_err bounds, then draw one facet row per pollutant:
 # a horizontal interval line plus an 'orangered' marker at the 'seen' value.
 half_width = 1.96*average_ests['std_err']
 average_ests['lower'] = average_ests['mean'] - half_width
 average_ests['upper'] = average_ests['mean'] + half_width
 g = sns.FacetGrid(average_ests, row = 'pollutant', sharex = False)
 g.map(plt.hlines, 'y', 'lower', 'upper')
 g.map(plt.scatter, 'seen', 'y', color = 'orangered')
 g.set_ylabels('')
 g.set_xlabels('')


 # Per-year intervals as thick translucent bars, a black '|' marker at each
 # mean, and a dashed vertical reference line at x = 0.
 plt.hlines(data = diffs_by_year,
            y = 'year', xmin = 'lower', xmax = 'upper',
            color = 'steelblue', alpha = 0.7, linewidth = 5)
 plt.plot('mean', 'year', 'k|', data = diffs_by_year)
 plt.axvline(x = 0, linestyle = '--', color = 'orangered')


 # Band of mean +/- 2.58 * std_err across 'day', with the mean drawn on top
 # as a faint white line.
 margin = 2.58*vandenberg_NO2['std_err']
 vandenberg_NO2['lower'] = vandenberg_NO2['mean'] - margin
 vandenberg_NO2['upper'] = vandenberg_NO2['mean'] + margin
 plt.plot('day', 'mean', data = vandenberg_NO2, alpha = 0.4, color = 'white')
 plt.fill_between(data = vandenberg_NO2, x = 'day', y1 = 'lower', y2 = 'upper')


 # The same band-plus-mean picture, one panel per city (two panels per row).
 g = sns.FacetGrid(eastern_SO2, col = 'city', col_wrap = 2)
 g.map(plt.fill_between, 'day', 'lower', 'upper', color = 'coral')
 g.map(plt.plot, 'day', 'mean', color = 'white')


 # Two cities overlaid on one axes, each with its own colour for band and line.
 city_colors = {'Denver': "#66c2a5", 'Long Beach': "#fc8d62"}
 for city, color in city_colors.items():
     city_data = SO2_compare[SO2_compare.city  ==  city]
     plt.fill_between(data = city_data, x = 'day', y1 = 'lower', y2 = 'upper',
                      color = color, alpha = 0.4)
     plt.plot('day','mean', data = city_data, color = color, alpha = 0.25, label = city)


 # Model coefficient intervals at three confidence levels, drawn widest
 # (alpha 0.01) first so the narrower ones are painted on top.
 alphas = [0.01, 0.05, 0.1]
 widths = ['99% CI', '95%', '90%']
 colors = ['#fee08b', '#fc8d59', '#d53e4f']
 for alpha, color, width in zip(alphas, colors, widths):
     conf_ints = pollution_model.conf_int(alpha)
     lower_bound, upper_bound = conf_ints[0], conf_ints[1]
     plt.hlines(y = conf_ints.index, xmin = lower_bound, xmax = upper_bound,
                linewidth = 10, colors = color, label = width)
 # White circles mark the coefficient point estimates.
 coefficients = pollution_model.params
 plt.plot(coefficients, coefficients.index, 'wo', label = 'Point Estimate')


 # Two nested translucent bands around the mean across days.
 # NOTE(review): 1.67 is a little above the usual two-sided 90% z-score
 # (about 1.645); the value is kept as written.
 int_widths = ['90%', '99%']
 z_scores = [1.67, 2.58]
 colors = ['#fc8d59', '#fee08b']
 for percent, Z, color in zip(int_widths, z_scores, colors):
     band_margin = Z*cinci_13_no2['std_err']
     plt.fill_between(
         x = cinci_13_no2.day,
         y1 = cinci_13_no2['mean'] - band_margin,
         y2 = cinci_13_no2['mean'] + band_margin,
         color = color, alpha = 0.4, label = percent)


 # Same-colour intervals distinguished by thickness: the narrowest interval
 # gets the thickest line, the widest the thinnest.
 sizes = [15, 10, 5]
 int_widths = ['90% CI', '95%', '99%']
 z_scores = [1.67, 1.96, 2.58]
 for percent, Z, size in zip(int_widths, z_scores, sizes):
     est_margin = Z*rocket_model['std_err']
     plt.hlines(y = rocket_model.pollutant,
                xmin = rocket_model['est'] - est_margin,
                xmax = rocket_model['est'] + est_margin,
                color = 'gray', linewidth = size, label = percent)
 plt.plot('est', 'pollutant', 'wo', data = rocket_model, label = 'Point Estimate')
 # Put the legend outside the axes, centred on the right-hand edge.
 plt.legend(bbox_to_anchor = (1, 0.5), loc = 'center left')


 # Bootstrap the Cincinnati May NO2 values, shade the 2.5th-97.5th percentile
 # range of the bootstrap results, and overlay their histogram.
 # `bootstrap` is defined outside this snippet.
 cinci_may_NO2 = pollution.query("city  ==  'Cincinnati' & month  ==  5")['NO2']
 boot_means = bootstrap(cinci_may_NO2, 1000)
 percentile_bounds = np.percentile(boot_means, [2.5, 97.5])
 lower, upper = percentile_bounds
 plt.axvspan(lower, upper, alpha = 0.2, color = 'gray')
 sns.distplot(boot_means, kde = False, bins = 100)


 # Draw one faint regression line per value of 'sample' in `no2_so2_boot`
 # (no scatter, no CI band, no legend), then scatter the `no2_so2` points.
 # x and y are named explicitly: seaborn >= 0.12 accepts only `data` as a
 # positional argument, so the old positional form raises TypeError there.
 sns.lmplot(x = 'NO2', y = 'SO2', data = no2_so2_boot,
            hue = 'sample',
            line_kws = {'color': 'steelblue', 'alpha': 0.2},
            ci = None, legend = False, scatter = False)
 plt.scatter('NO2', 'SO2', data = no2_so2)


 # Build 100 bootstrap results of May NO2 for each of four cities and show
 # them as a swarm plot, one row of points per city.
 # The per-city frames are collected in a list and concatenated once; this
 # avoids re-copying the growing frame on every iteration and avoids
 # concatenating onto an empty seed DataFrame.
 boot_frames = []
 for city in ['Cincinnati', 'Des Moines', 'Indianapolis', 'Houston']:
     city_NO2 = pollution_may[pollution_may.city  ==  city].NO2
     # The scalar `city` is broadcast to every row of the frame.
     cur_boot = pd.DataFrame({'NO2_avg': bootstrap(city_NO2, 100), 'city':city})
     boot_frames.append(cur_boot)
 city_boots = pd.concat(boot_frames)
 sns.swarmplot(y = "city", x = "NO2_avg", data = city_boots, color = 'coral')
diff --git a/dtc: Improving Data Visualizations 5 b/dtc: Improving Data Visualizations 5
 # First three rows, transposed so each original column becomes a row.
 first_rows = markets.head(3).T

 # Summary of every column (median as the only percentile), also transposed.
 col_descriptions = markets.describe(percentiles = [0.5], include = 'all').T

 # Pairwise scatter plots of the columns listed in `numeric_columns`.
 pd.plotting.scatter_matrix(markets[numeric_columns], alpha = 0.5, figsize = (15,10))

 # Natural log of state population, plotted against the number of items sold.
 markets['log_pop'] = np.log(markets['state_pop'])
 sns.scatterplot(data = markets, x = 'log_pop', y = 'num_items_sold', alpha = 0.25)


 # Regression of months_open on lat over faint gray points.
 # NOTE(review): `ci = False` is kept as written; seaborn documents `ci = None`
 # as the way to switch the interval off.
 sns.regplot(data = markets, x = 'lat', y = 'months_open', ci = False,
             scatter_kws = dict(alpha = 0.1, color = 'gray'))


 # Regression of log_pop on log_markets with tiny markers, then a text label
 # at every point.
 # x and y are named explicitly: seaborn >= 0.12 accepts only `data` as a
 # positional argument, so the old positional form raises TypeError there.
 g = sns.regplot(x = "log_markets", y = "log_pop", ci = False,
                 scatter_kws = {'s':2}, data = markets_and_pop)
 for _, row in markets_and_pop.iterrows():
     # Positional unpack: assumes exactly five columns, with the label first
     # and log_markets / log_pop last -- TODO confirm against the frame.
     state, _, _, log_markets, log_pop = row
     g.annotate(state, (log_markets,log_pop), size=10)


 # Text-only scatter: place each state's name at its (good, prop_selling)
 # position for a hand-picked subset of goods.
 to_plot = ['Cheese','Maple','Fruits','Grains','Seafood','Plants','Vegetables']
 # `isin` replaces a query string built with str(list), which depended on the
 # list's repr being valid query syntax.
 goods_by_state_small = goods_by_state[goods_by_state['good'].isin(to_plot)]
 # s = 0 draws invisible markers; the call only sets up axes, limits and labels.
 # x and y are named explicitly: seaborn >= 0.12 accepts only `data`
 # positionally, so the old positional form raises TypeError there.
 g = sns.scatterplot(x = 'good', y = 'prop_selling', data = goods_by_state_small, s = 0)
 for _,row in goods_by_state_small.iterrows():
     g.annotate(row['state'], (row['good'], row['prop_selling']), ha = 'center', size = 10)
	# Two side-by-side panels from `markets_by_state`, both coloured by the
	# `is_selected` column. Seaborn variables are passed by keyword throughout
	# this block: seaborn >= 0.12 accepts only `data` positionally.
	f, (ax1, ax2) = plt.subplots(1, 2, figsize = (15, 15))

	sns.barplot(x = 'people_per_market', y = 'state', hue = 'is_selected',
	            dodge = False, data = markets_by_state, ax = ax1)
	sns.scatterplot(x = 'log_pop', y = 'log_markets', hue = 'is_selected',
	                data = markets_by_state, ax = ax2, s = 100)
	# `hue` adds a legend to each axes; remove both.
	ax1.legend_.remove()
	ax2.legend_.remove()


	# Faint tick marks for every row, lines for three highlighted states, and
	# a text label per highlighted state.
	sns.set_style('whitegrid')
	plt.scatter('good','prop selling', marker = '_', alpha = 0.7, data = goods_by_state)
	highlighted = goods_by_state.query("state in ['New Mexico','North Dakota','Vermont']")
	sns.lineplot(x = 'good', y = 'prop selling', hue = 'state',
	             data = highlighted, legend = False)
	# NOTE(review): despite the name, this keeps the FIRST row of each state.
	last_rows = highlighted.groupby('state', as_index = False).agg('first')
	for _,row in last_rows.iterrows():
	    # Loop body re-indented; it had been flattened to the `for` level.
	    plt.annotate(row['state'], (row['good'], row['prop selling']),
	                 ha = 'right', xytext = (5,0), textcoords = 'offset pixels')
	sns.despine(bottom = True, left = True)


	# Heatmap of `markets_by_month` with rows reordered to follow `state_by_lat`.
	sns.set(font_scale = 0.85)
	blue_pal = sns.light_palette("steelblue", as_cmap = True)
	g = sns.heatmap(markets_by_month.reindex(state_by_lat),
	                linewidths = 0.1,
	                cmap = blue_pal, cbar = False,
	                yticklabels = True)
	g.set_yticklabels(g.get_yticklabels(), rotation = 0)
	plt.title('Distribution of months open for farmers markets by latitude')
	plt.show()



	# Per-state bars plus a log-log scatter with one annotated point.
	sns.barplot(x = 'people_per_market', y = 'state', palette = state_colors,
	            data = markets_by_state, ax = ax1)
	p = sns.scatterplot(x = 'population', y = 'num_markets', color = state_colors,
	                    data = markets_by_state, s = 60, ax = ax2)
	ax2.set(xscale = "log", yscale = 'log')
	# Arrow points at xy; the text `tx_message` sits directly above it at xytext.
	ax2.annotate(tx_message, xy = (26956958,230),
	             xytext = (26956958, 450),ha = 'right',
	             size = 15, backgroundcolor = 'white',
	             arrowprops = {'facecolor':'black', 'width': 3})
	sns.set_style('whitegrid')
	# Scatter Houston's NO2 vs SO2 with the observation(s) from day 330 of 2014
	# drawn in 'orangered' and every other point in 'lightgray'.
	houston_pollution = pollution.loc[pollution.city == 'Houston']
	houston_colors = [
	    'orangered' if (day == 330) & (year == 2014) else 'lightgray'
	    for day, year in zip(houston_pollution.day, houston_pollution.year)
	]
	sns.regplot(
	    data = houston_pollution, x = 'NO2', y = 'SO2',
	    fit_reg = False,   # scatter only, no regression line
	    scatter_kws = dict(facecolors = houston_colors, alpha = 0.7),
	)


	# Same scatter, but rows whose O3 equals the Houston maximum are tagged
	# 'Highest O3 Day' and `hue` colours by that tag.
	houston_pollution = pollution.loc[pollution.city == 'Houston'].copy()
	max_O3 = houston_pollution.O3.max()
	houston_pollution['point_type'] = [
	    'Highest O3 Day' if reading == max_O3 else 'Others'
	    for reading in houston_pollution.O3
	]
	sns.scatterplot(data = houston_pollution, x = 'NO2', y = 'SO2', hue = 'point_type')


	# Filled density curves of O3: year 2012 versus every other year.
	# NOTE(review): recent seaborn releases warn that `shade` and `distplot`
	# are deprecated; both are kept unchanged here.
	in_2012 = pollution.year == 2012
	not_2012 = pollution.year != 2012
	sns.kdeplot(pollution.loc[in_2012, 'O3'], shade = True, label = '2012')
	sns.kdeplot(pollution.loc[not_2012, 'O3'], shade = True, label = 'other years')


	# Density curve (histogram off) plus rug marks for one city's O3 readings.
	vandenberg_O3 = pollution.loc[pollution.city == 'Vandenberg Air Force Base', 'O3']
	sns.distplot(vandenberg_O3, label = 'Vandenberg',
	             rug = True, hist = False, color = 'steelblue')


	# Swarm plot of O3 by city, restricted to rows where month == 3.
	pollution_mar = pollution.loc[pollution.month == 3]
	sns.swarmplot(data = pollution_mar, x = 'O3', y = "city", size = 3)
	# Left-aligned caption placed at data coordinates (0.57, 41).
	caption_font = dict(ha = 'left', size = 'large')
	plt.text(0.57, 41,
	         'Cincinnati had highest observed\nSO2 value on Aug 11, 2013',
	         fontdict = caption_font)


	# Arrow from the label at (2, 15) to the CO / NO2 point held in `lb_newyears`.
	arrow_style = dict(facecolor = 'gray', width = 3, shrink = 0.03)
	plt.annotate('Long Beach New Years',
	             xy = (lb_newyears['CO'], lb_newyears['NO2']),
	             xytext = (2, 15),
	             arrowprops = arrow_style,
	             backgroundcolor = 'white')


	# One face colour per row: Long Beach in 'orangered', the rest 'lightgray'.
	is_lb = ['lightgray' if city != 'Long Beach' else 'orangered'
	         for city in pollution['city']]
	sns.regplot(data = pollution, x = 'CO', y = 'O3', fit_reg = False,
	            scatter_kws = dict(facecolors = is_lb, alpha = 0.3))
	# One CO-vs-NO2 scatter per city, wrapped three panels per row.
	g = sns.FacetGrid(data = pollution, col = 'city', col_wrap = 3)
	g.map(sns.scatterplot, 'CO', 'NO2', alpha = 0.2)

	# The same mean-CO-per-city bar chart drawn twice: once with black bar
	# edges, once in a single flat colour.
	mean_co_bars = dict(y = 'city', x = 'CO', estimator = np.mean, ci = False, data = pollution)
	sns.barplot(edgecolor = 'black', **mean_co_bars)
	sns.barplot(color = 'cadetblue', **mean_co_bars)

	# Continuous hue (O3) mapped through a light-to-orangered colormap.
	color_palette = sns.light_palette('orangered', as_cmap = True)
	sns.scatterplot(data = cinci_2014, x = 'CO', y = 'NO2',
	                hue = 'O3', palette = color_palette)

	# Diverging colormap centred on 0 with the colour range pinned to [-4, 4].
	color_palette = sns.diverging_palette(250, 0, as_cmap = True)
	sns.heatmap(nov_2015_CO, cmap = color_palette,
	            vmin = -4, vmax = 4, center = 0)

	# Dark-background variant: the diverging palette's midpoint is set to dark.
	plt.style.use("dark_background")
	color_palette = sns.diverging_palette(250, 0, center = 'dark', as_cmap = True)
	sns.heatmap(oct_2015_o3, center = 0, cmap = color_palette)


	# One thick CO line per city, coloured with the qualitative 'Set2' palette.
	sns.lineplot(data = pollution_jan13, x = "day", y = "CO",
	             hue = "city", palette = "Set2", linewidth = 3)


	# Keep distinct colours only for three city/pollutant combinations; every
	# other combination is relabelled 'other' so it shares one colour.
	wanted_combos = ['Vandenberg Air Force Base NO2', 'Long Beach CO', 'Cincinnati SO2']
	city_pol_month['color_cats'] = [
	    'other' if combo not in wanted_combos else combo
	    for combo in city_pol_month['city_pol']
	]
	# units + estimator=None: draw one un-aggregated line per 'city_pol' value.
	sns.lineplot(data = city_pol_month, x = "month", y = "value",
	             hue = 'color_cats', palette = 'Set2',
	             units = 'city_pol', estimator = None)


	# Bin CO into four quantile buckets (integer codes, since labels=False) and
	# use the bucket as an ordinal hue for the Des Moines rows.
	pollution['CO quartile'] = pd.qcut(pollution['CO'], q = 4, labels = False)
	des_moines = pollution.query("city == 'Des Moines'")
	sns.scatterplot(data = des_moines, x = 'SO2', y = 'NO2',
	                hue = 'CO quartile', palette = 'GnBu')


	# One row of bars per pollutant, each with its own y-axis scale.
	sns.catplot(data = city_maxes, kind = 'bar',
	            x = 'city', y = 'value', hue = 'year', row = 'pollutant',
	            palette = 'BuGn', sharey = False)
	# Uncertainty plots. Fixes relative to the flattened original: every `for`
	# body is re-indented, the stray backslash in the 'k|' format string is
	# removed, `lmplot` gets keyword x/y (seaborn >= 0.12 accepts only `data`
	# positionally), and the bootstrap frames are concatenated once.

	# mean +/- 1.96 * std_err bounds, one facet row per pollutant.
	average_ests['lower'] = average_ests['mean'] - 1.96*average_ests['std_err']
	average_ests['upper'] = average_ests['mean'] + 1.96*average_ests['std_err']
	g = sns.FacetGrid(average_ests, row = 'pollutant', sharex = False)
	g.map(plt.hlines, 'y', 'lower', 'upper')
	g.map(plt.scatter, 'seen', 'y', color = 'orangered').set_ylabels('').set_xlabels('')


	# Per-year intervals, a black '|' marker at each mean, dashed line at 0.
	plt.hlines(y = 'year', xmin = 'lower', xmax = 'upper', linewidth = 5, color = 'steelblue',
	           alpha = 0.7, data = diffs_by_year)
	plt.plot('mean', 'year', 'k|', data = diffs_by_year)
	plt.axvline(x = 0, color = 'orangered', linestyle = '--')


	# Band of mean +/- 2.58 * std_err with the mean as a faint white line.
	vandenberg_NO2['lower'] = vandenberg_NO2['mean'] - 2.58*vandenberg_NO2['std_err']
	vandenberg_NO2['upper'] = vandenberg_NO2['mean'] + 2.58*vandenberg_NO2['std_err']
	plt.plot('day', 'mean', data = vandenberg_NO2, color = 'white', alpha = 0.4)
	plt.fill_between(x = 'day', y1 = 'lower', y2 = 'upper', data = vandenberg_NO2)


	# The same band-plus-mean picture, one panel per city.
	g = sns.FacetGrid(eastern_SO2, col = 'city', col_wrap = 2)
	g.map(plt.fill_between, 'day', 'lower', 'upper', color = 'coral')
	g.map(plt.plot, 'day', 'mean', color = 'white')


	# Two cities overlaid on one axes, each with its own colour.
	for city, color in [('Denver',"#66c2a5"), ('Long Beach', "#fc8d62")]:
	    city_data = SO2_compare[SO2_compare.city == city]
	    plt.fill_between(x = 'day', y1 = 'lower', y2 = 'upper', data = city_data, color = color, alpha = 0.4)
	    plt.plot('day','mean', data = city_data, label = city, color = color, alpha = 0.25)


	# Coefficient intervals at three confidence levels, widest drawn first.
	alphas = [ 0.01, 0.05, 0.1]
	widths = [ '99% CI', '95%', '90%']
	colors = ['#fee08b','#fc8d59','#d53e4f']
	for alpha, color, width in zip(alphas, colors, widths):
	    conf_ints = pollution_model.conf_int(alpha)
	    plt.hlines(y = conf_ints.index, xmin = conf_ints[0], xmax = conf_ints[1],
	               colors = color, label = width, linewidth = 10)
	plt.plot(pollution_model.params, pollution_model.params.index, 'wo', label = 'Point Estimate')


	# Two nested translucent bands around the mean.
	# NOTE(review): 1.67 is a little above the usual two-sided 90% z-score
	# (about 1.645); the value is kept as written.
	int_widths = ['90%', '99%']
	z_scores = [1.67, 2.58]
	colors = ['#fc8d59', '#fee08b']
	for percent, Z, color in zip(int_widths, z_scores, colors):
	    plt.fill_between(
	        x = cinci_13_no2.day, alpha = 0.4, color = color,
	        y1 = cinci_13_no2['mean'] - Z*cinci_13_no2['std_err'],
	        y2 = cinci_13_no2['mean'] + Z*cinci_13_no2['std_err'],
	        label = percent)


	# Same-colour intervals distinguished by line thickness.
	sizes = [ 15, 10, 5]
	int_widths = ['90% CI', '95%', '99%']
	z_scores = [ 1.67, 1.96, 2.58]
	for percent, Z, size in zip(int_widths, z_scores, sizes):
	    plt.hlines(y = rocket_model.pollutant,
	               xmin = rocket_model['est'] - Z*rocket_model['std_err'],
	               xmax = rocket_model['est'] + Z*rocket_model['std_err'],
	               label = percent,
	               linewidth = size,
	               color = 'gray')
	plt.plot('est', 'pollutant', 'wo', data = rocket_model, label = 'Point Estimate')
	plt.legend(loc = 'center left', bbox_to_anchor = (1, 0.5))


	# Bootstrap results with their 2.5th-97.5th percentile range shaded.
	# `bootstrap` is defined outside this snippet.
	cinci_may_NO2 = pollution.query("city == 'Cincinnati' & month == 5").NO2
	boot_means = bootstrap(cinci_may_NO2, 1000)
	lower, upper = np.percentile(boot_means, [2.5, 97.5])
	plt.axvspan(lower, upper, color = 'gray', alpha = 0.2)
	sns.distplot(boot_means, bins = 100, kde = False)


	# One faint regression line per value of 'sample', then the raw points.
	sns.lmplot(x = 'NO2', y = 'SO2', data = no2_so2_boot,
	           hue = 'sample',
	           line_kws = {'color': 'steelblue', 'alpha': 0.2},
	           ci = None, legend = False, scatter = False)
	plt.scatter('NO2', 'SO2', data = no2_so2)


	# 100 bootstrap results per city, collected and concatenated once instead
	# of re-concatenating a growing frame onto an empty seed DataFrame.
	boot_frames = []
	for city in ['Cincinnati', 'Des Moines', 'Indianapolis', 'Houston']:
	    city_NO2 = pollution_may[pollution_may.city == city].NO2
	    cur_boot = pd.DataFrame({'NO2_avg': bootstrap(city_NO2, 100), 'city':city})
	    boot_frames.append(cur_boot)
	city_boots = pd.concat(boot_frames)
	sns.swarmplot(y = "city", x = "NO2_avg", data = city_boots, color = 'coral')
	# First-look exploration of `markets`. Fixes relative to the flattened
	# original: both `for` bodies are re-indented, seaborn x/y are passed by
	# keyword (seaborn >= 0.12 accepts only `data` positionally), and the
	# goods filter uses `isin` instead of a query string built from str(list).

	# First three rows, transposed so each original column becomes a row.
	first_rows = markets.head(3).transpose()

	# Summary of every column (median as the only percentile), transposed.
	col_descriptions = markets.describe(include = 'all',
	                                    percentiles = [0.5]).transpose()

	# Pairwise scatter plots of the columns listed in `numeric_columns`.
	pd.plotting.scatter_matrix(markets[numeric_columns], figsize = (15,10), alpha = 0.5)

	# Natural log of state population against the number of items sold.
	markets['log_pop'] = np.log(markets['state_pop'])
	sns.scatterplot(x = 'log_pop', y = 'num_items_sold', alpha = 0.25, data = markets)


	# Regression of months_open on lat over faint gray points.
	sns.regplot(x = 'lat', y = 'months_open', ci = False, data = markets,
	            scatter_kws = {'alpha':0.1, 'color':'gray'})


	# Regression with tiny markers, then a text label at every point.
	g = sns.regplot(x = "log_markets", y = "log_pop", ci = False,
	                scatter_kws = {'s':2}, data = markets_and_pop)
	for _, row in markets_and_pop.iterrows():
	    # Positional unpack: assumes exactly five columns, with the label
	    # first and log_markets / log_pop last -- TODO confirm.
	    state, _, _, log_markets, log_pop = row
	    g.annotate(state, (log_markets,log_pop), size=10)


	# Text-only scatter: state names placed at (good, prop_selling).
	to_plot = ['Cheese','Maple','Fruits','Grains','Seafood','Plants','Vegetables']
	goods_by_state_small = goods_by_state[goods_by_state['good'].isin(to_plot)]
	# s = 0 draws invisible markers; the call only sets up axes and limits.
	g = sns.scatterplot(x = 'good', y = 'prop_selling', data = goods_by_state_small, s = 0)
	for _,row in goods_by_state_small.iterrows():
	    g.annotate(row['state'], (row['good'], row['prop_selling']), ha = 'center', size = 10)