bradleyboehmke · October 21, 2018 15:57
diff --git a/analytics-connect-intro-r-student-script.R b/analytics-connect-intro-r-student-script.R
 ###############################################################################
 #                                 FUNDAMENTALS                                #
 ###############################################################################

 # slide 13 ----------------------------------------------------------------

 # assess where the outputs from these lines of code appear
 mtcars
 ?sum
 hist(mtcars$mpg)
 random_numbers <- runif(25)
 history()


 # slide 14 ----------------------------------------------------------------

 # provides details for specific function
 help(sqrt)
 ?sqrt
 # provides examples for said function
 example(sqrt)


 # slide 15 ----------------------------------------------------------------

 # get your current working directory
 getwd()

 # now set your working directory
 setwd("enter/path/to/your/working/directory")


 # slide 16 ----------------------------------------------------------------

 # R follows the PEMDAS order of operations: parentheses, exponents,
 # multiplication/division, then addition/subtraction
 4 + 3 / 10^2
 (4 + 3) / 10^2
 (4 + 3 / 10)^2

 # any calculation that involves NA returns NA
 4 + 3 / 10^NA

 # store the result in an object so it can be reused later
 x <- 4 + 3 / 10^2


 # slide 17 ----------------------------------------------------------------

 D <- 1000
 K <- 5
 h <- .25

 # now compute Q based on equation




 # slide 18 ----------------------------------------------------------------

 # list all objects
 ls()

 # remove a single object
 rm(D)

 # remove all objects
 rm(list = ls())


 # slide 20 ----------------------------------------------------------------

 # install the tidyverse and nycflights13 packages



 ###############################################################################
 #                               Importing Data                                #
 ###############################################################################

 # slide 26 ----------------------------------------------------------------

 # load readr and readxl packages
 library(readr)
 library(readxl)

 # import .csv data --> may need to adjust path to where you saved the csv file
 my_csv_data <- read_csv("data/mydata.csv") 
 my_csv_data

 # read .xlsx sheet names
 excel_sheets("data/mydata.xlsx")

 # load specified sheet
 my_xlsx_data <- read_excel("data/mydata.xlsx", sheet = "PICK_ME_FIRST!")
 my_xlsx_data


 # slide 28 ----------------------------------------------------------------

 # 1: Read in the CustomerData.csv file and save as customers

 # 2: What spreadsheets are contained in the CustomerData.xlsx file?

 # 3. Read in the spreadsheet that contains the data.



 # slide 30 ----------------------------------------------------------------

 # dimensions (rows x columns)
 dim(customers)

 # get a quick glimpse of the data
 # glimpse() is exported by dplyr, which is not attached until the
 # Transforming Data section, so it is called through its namespace here
 dplyr::glimpse(customers)

 # get the names of all the variables
 names(customers)

 # how many missing values exist (counted across every cell of the data)
 sum(is.na(customers))

 # omit all observations with missing values
 clean_data <- na.omit(customers)

 # view the data in a spreadsheet like viewer
 View(customers)


 ###############################################################################
 #                             Transforming Data                               #
 ###############################################################################


 # slide 34 ----------------------------------------------------------------

 # packages required
 library(nycflights13)
 library(dplyr)

 # data used in examples
 flights


 # slide 35 ----------------------------------------------------------------

 # filter for all observations in month 1 (January)
 filter(flights, month == 1)

 # filter for all observations on January 1st
 filter(flights, month == 1, day == 1)

 # filter for all observations on January 1st with a departure delay
 filter(flights, month == 1, day == 1, dep_delay > 0)


 # slide 36 ----------------------------------------------------------------

 # dplyr functions do not over-write data; must save to a new data frame object
 dec25 <- filter(flights, month == 12, day == 25)
 dec25


 # slide 37 ----------------------------------------------------------------

 # what will these produce?
 filter(flights, month == 12)
 filter(flights, month != 12)
 filter(flights, month %in% c(11, 12))
 filter(flights, arr_delay <= 120)
 filter(flights, !(arr_delay <= 120))
 filter(flights, is.na(tailnum))


 # slide 38 ----------------------------------------------------------------

 # set 1
 filter(flights, month == 12, day == 25)
 filter(flights, month == 12 & day == 25)

 # set 2
 filter(flights, month == 11 | month == 12)
 filter(flights, month %in% c(11, 12))

 # set 3 --> are these the same?
 filter(flights, !(arr_delay > 120 | dep_delay > 120))
 filter(flights, arr_delay <= 120, dep_delay <= 120)



 # slide 39 ----------------------------------------------------------------

 # 1: Import the CustomerData.csv file.

 # 2: Filter for female customers only.

 # 3: Filter for female customers that are greater than 45 years old and live in region 3.

 # 4: Filter for female customers that are greater than 45 years old or live in region 3.


 # slide 40 ----------------------------------------------------------------

 # select variables of interest
 select(flights, year, month, day)
 select(flights, year:day)


 # slide 41 ----------------------------------------------------------------

 # helper functions
 select(flights, starts_with("arr"))
 select(flights, ends_with("time"))
 select(flights, c(carrier, ends_with("time"), contains("arr")))


 # slide 42 ----------------------------------------------------------------

 # 1: Using the customer data, select all columns between CustomerID and Gender.

 # 2: Now select all columns except those between and including CustomerID and Gender.

 # 3: Select CustomerID and all variables that contain the word "Card".


 # slide 43 ----------------------------------------------------------------

 # reorder data
 arrange(flights, dep_time)
 arrange(flights, desc(dep_time))


 # slide 45 ----------------------------------------------------------------

 # 1: Select the variables CustomerID, Region, Gender, Age, HHIncome, CardSpendMonth 
 #    and save this as sub_cust.

 # 2: Order sub_cust data by Age and CardSpendMonth (ascending order)

 # 3: Order sub_cust data by Age (oldest to youngest) and CardSpendMonth (least to most)



 # slide 46 ----------------------------------------------------------------

 # create smaller data set so we can see the results
 flights_sml <- select(flights, ends_with("delay"), distance, air_time)
 flights_sml

 # create two new variables (gain, speed)
 mutate(flights_sml,
       gain = arr_delay - dep_delay,
       speed = distance / air_time * 60
 )

 # mutate creates variables in order so we can create 
 # variables from recently created variables
 mutate(flights_sml,
       gain = arr_delay - dep_delay,
       hours = air_time / 60,
       gain_per_hour = gain / hours
 )


 # slide 47 ----------------------------------------------------------------

 # mean center data: subtract the mean so the new variable is centered on zero
 transmute(flights, center_delay = dep_delay - mean(dep_delay, na.rm = TRUE))

 # transform values
 transmute(flights,
           log_air_time = log2(air_time),
           exp_delay = exp(dep_delay)
 )

 # lag and cumsum values
 # note: cumsum() has no na.rm argument; if dep_delay contains a missing
 # value, sum_delay is NA from that row onward
 transmute(flights,
           dep_delay = dep_delay,
           lag_delay = lag(dep_delay),
           sum_delay = cumsum(dep_delay)
 )


 # slide 48 ----------------------------------------------------------------

 # Using your customers data...

 # 1: Create a ratio variable that computes the ratio of CardSpendMonth to HHIncome

 # 2: Create two variables:
 #      ratio1 = CardSpendMonth / HHIncome
 #      ratio2 = CardSpendMonth / Age



 # slide 49 ----------------------------------------------------------------

 # compute single summary statistic 
 summarize(flights, dep_delay_mean = mean(dep_delay, na.rm = TRUE))

 # compute multiple summary statistics
 summarize(flights, 
          dep_delay_mean = mean(dep_delay, na.rm = TRUE),
          dep_delay_sd = sd(dep_delay, na.rm = TRUE),
          n = n()
 )


 # slide 51 ----------------------------------------------------------------

 # compute grouped summary statistics

 # group by month then compute average departure delay
 by_month <- group_by(flights, month)
 summarize(by_month, avg_delay = mean(dep_delay, na.rm = TRUE))

 # group by carrier then compute standard deviation of departure delay
 by_carrier <- group_by(flights, carrier)
 summarize(by_carrier, delay_sd = sd(dep_delay, na.rm = TRUE))


 # slide 53 ----------------------------------------------------------------

 # 1: In our customers data, compute the average CardSpendMonth across all customers.

 # 2: Now compute the average CardSpendMonth for each gender.

 # 3: Now compute the average CardSpendMonth for each gender and region. Which 
 #    gender and region have the highest average spend?



 # slide 55 ----------------------------------------------------------------

 # traditional approach: save each intermediate result, then pass it on
 by_gdr_rgn <- group_by(customers, Gender, Region)
 avg_gdr_rgn <- summarize(by_gdr_rgn, Avg_spend = mean(CardSpendMonth, na.rm = TRUE))
 arrange(avg_gdr_rgn, desc(Avg_spend))

 # pipe operator approach: the same three steps on the same customers data,
 # chained together without intermediate objects
 customers %>%
   group_by(Gender, Region) %>%
   summarize(Avg_spend = mean(CardSpendMonth, na.rm = TRUE)) %>%
   arrange(desc(Avg_spend))


 # slide 56 ----------------------------------------------------------------

 # Using the pipe operator follow these steps with the customers data:
  
 # 1: filter for male customers only
 # 2: create a new variable: ratio = CardSpendMonth / HHIncome
 # 3: group this data by age
 # 4: compute the mean of the new ratio variable by age
 # 5: sort this output to find the age with the highest ratio of expenditures to income.



 ###############################################################################
 #                               Visualizing Data                              #
 ###############################################################################


 # slide 61 ----------------------------------------------------------------

 library(ggplot2)
 mpg


 # slide 62 ----------------------------------------------------------------

 # create basic canvas for plot
 ggplot(data = mpg)
 ggplot(data = mpg, aes(x = displ, y = hwy))


 # slides 64-65 ------------------------------------------------------------

 # univariate geoms
 ggplot(data = mpg, aes(x = hwy)) +
  geom_histogram()

 ggplot(data = mpg, aes(x = hwy)) +
  geom_freqpoly()

 ggplot(data = mpg, aes(x = hwy)) +
  geom_density()

 # bivariate geoms
 ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point()

 ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

 ggplot(data = mpg, aes(x = class, y = hwy)) +
  geom_violin()


 # slide 66 ----------------------------------------------------------------

 # Using the customers data:
  
 # 1: Create a chart that illustrates the distribution of the DebtToIncomeRatio variable.

 # 2: Create a chart that shows the counts for each JobCategory

 # 3: Create a scatter plot of HHIncome vs CardSpendMonth



 # slide 67 ----------------------------------------------------------------

 # many non-mapping aesthetics (color, size, shape, transparency)
 ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = .5)


 # slide 69 ----------------------------------------------------------------

 # non-mapping color aesthetic
 ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "blue")

 # mapping color aesthetic to class variable
 ggplot(data = mpg, aes(x = displ, y = hwy, color = class)) +
  geom_point()


 # slide 70 ----------------------------------------------------------------

 # 1: Create a scatter plot of HHIncome vs CardSpendMonth and color all points blue.

 # 2: Create a scatter plot of HHIncome vs CardSpendMonth and color all points based 
 #    on whether or not the customer is retired.



 # slide 71 ----------------------------------------------------------------

 # facet_wrap for single variable small multiples
 ggplot(data = mpg, aes(x = displ, y = hwy)) + 
  geom_point() + 
  facet_wrap(~ class, nrow = 2)

 # facet_grid for two variable small multiples
 ggplot(data = mpg, aes(x = displ, y = hwy)) + 
  geom_point() + 
  facet_grid(drv ~ cyl)


 # slide 72 ----------------------------------------------------------------

 # 1: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory.

 # 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and Gender.

 # 3: Assess UnionMember across each JobCategory.


 # slide 73 ----------------------------------------------------------------

 # adding titles with ggtitle
 ggplot(data = mpg, aes(x = displ, y = hwy)) + 
  geom_jitter() +
  ggtitle("Displacement vs Highway MPG",
          subtitle = "Data from 1999 & 2008")

 # adding titles with labs
 ggplot(data = mpg, aes(x = displ, y = hwy)) + 
  geom_jitter() +
  labs(
    title = "Displacement vs Highway MPG",
    subtitle = "Data from 1999 & 2008",
    caption = "http://fueleconomy.gov"
  )


 # slide 74 ----------------------------------------------------------------

 # adjusting x and y axis parameters
 ggplot(data = txhousing, aes(x = volume, y = median)) + 
  geom_point(alpha = .25) +
  scale_x_log10()

 ggplot(data = txhousing, aes(x = volume, y = median)) + 
  geom_point(alpha = .25)  +
  scale_y_continuous(name = "Median Sales Price", labels = scales::dollar) +
  scale_x_log10(name = "Total Sales Volume", labels = scales::comma)


 # slide 75 ----------------------------------------------------------------

 # putting it all together: points, formatted axes, and titles in one plot
 # (txhousing covers 2000 through 2015, so the subtitle states that range)
 ggplot(data = txhousing, aes(x = volume, y = median)) +
   geom_point(alpha = .15) +
   scale_y_continuous(name = "Median Sales Price", labels = scales::dollar) +
   scale_x_log10(name = "Total Sales Volume", labels = scales::comma) +
   labs(
     title = "Texas Housing Sales",
     subtitle = "Sales data from 2000-2015 provided by the TAMU real estate center",
     caption = "http://recenter.tamu.edu/"
   )


 # slide 76 ----------------------------------------------------------------

 # 1: Remove all missing values from the customers data and then...
 # 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and...
 # 3: add a title, subtitle, and nicely format the axes.



 # slide 77 ----------------------------------------------------------------

 # examples of overplotting
 ggplot(data = txhousing, aes(x = volume, y = median)) + 
  geom_point(alpha = .1)  +
  scale_x_log10() +
  geom_smooth()

 ggplot(data = txhousing, aes(x = volume, y = median)) + 
  geom_point(alpha = .1)  +
  scale_x_log10() +
  geom_smooth(method = "lm")

 ggplot(data = txhousing, aes(x = volume, y = median)) + 
  geom_point(alpha = .1)  +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  facet_wrap(~ month)


 # slide 78 ----------------------------------------------------------------

 # 1: Remove all missing values from the customers data and then...
 # 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and...
 # 3: add a title, subtitle, and nicely format the axes and...
 # 4: add a linear line to assess if the slope changes across JobCategory
	###############################################################################
	# FUNDAMENTALS #
	###############################################################################

	# slide 13 ----------------------------------------------------------------

	# assess where the outputs from these lines of code appear
	mtcars
	?sum
	hist(mtcars$mpg)
	random_numbers <- runif(25)
	history()


	# slide 14 ----------------------------------------------------------------

	# provides details for specific function
	help(sqrt)
	?sqrt
	# provides examples for said function
	example(sqrt)


	# slide 15 ----------------------------------------------------------------

	# get your current working directory
	getwd()

	# now set your working directory
	setwd("enter/path/to/your/working/directory")


	# slide 16 ----------------------------------------------------------------

	# R follows the PEMDAS order of operations: parentheses, exponents,
	# multiplication/division, then addition/subtraction
	4 + 3 / 10^2
	(4 + 3) / 10^2
	(4 + 3 / 10)^2

	# any calculation that involves NA returns NA
	4 + 3 / 10^NA

	# store the result in an object so it can be reused later
	x <- 4 + 3 / 10^2


	# slide 17 ----------------------------------------------------------------

	D <- 1000
	K <- 5
	h <- .25

	# now compute Q based on equation




	# slide 18 ----------------------------------------------------------------

	# list all objects
	ls()

	# remove a single object
	rm(D)

	# remove all objects
	rm(list = ls())


	# slide 20 ----------------------------------------------------------------

	# install the tidyverse and nycflights13 packages



	###############################################################################
	# Importing Data #
	###############################################################################

	# slide 26 ----------------------------------------------------------------

	# load readr and readxl packages
	library(readr)
	library(readxl)

	# import .csv data --> may need to adjust path to where you saved the csv file
	my_csv_data <- read_csv("data/mydata.csv")
	my_csv_data

	# read .xlsx sheet names
	excel_sheets("data/mydata.xlsx")

	# load specified sheet
	my_xlsx_data <- read_excel("data/mydata.xlsx", sheet = "PICK_ME_FIRST!")
	my_xlsx_data


	# slide 28 ----------------------------------------------------------------

	# 1: Read in the CustomerData.csv file and save as customers

	# 2: What spreadsheets are contained in the CustomerData.xlsx file?

	# 3. Read in the spreadsheet that contains the data.



	# slide 30 ----------------------------------------------------------------

	# dimensions (rows x columns)
	dim(customers)

	# get a quick glimpse of the data
	# glimpse() is exported by dplyr, which is not attached until the
	# Transforming Data section, so it is called through its namespace here
	dplyr::glimpse(customers)

	# get the names of all the variables
	names(customers)

	# how many missing values exist (counted across every cell of the data)
	sum(is.na(customers))

	# omit all observations with missing values
	clean_data <- na.omit(customers)

	# view the data in a spreadsheet like viewer
	View(customers)


	###############################################################################
	# Transforming Data #
	###############################################################################


	# slide 34 ----------------------------------------------------------------

	# packages required
	library(nycflights13)
	library(dplyr)

	# data used in examples
	flights


	# slide 35 ----------------------------------------------------------------

	# filter for all observations in month 1 (January)
	filter(flights, month == 1)

	# filter for all observations on January 1st
	filter(flights, month == 1, day == 1)

	# filter for all observations on January 1st with a departure delay
	filter(flights, month == 1, day == 1, dep_delay > 0)


	# slide 36 ----------------------------------------------------------------

	# dplyr functions do not over-write data; must save to a new data frame object
	dec25 <- filter(flights, month == 12, day == 25)
	dec25


	# slide 37 ----------------------------------------------------------------

	# what will these produce?
	filter(flights, month == 12)
	filter(flights, month != 12)
	filter(flights, month %in% c(11, 12))
	filter(flights, arr_delay <= 120)
	filter(flights, !(arr_delay <= 120))
	filter(flights, is.na(tailnum))


	# slide 38 ----------------------------------------------------------------

	# set 1: comma-separated conditions and `&` both require every condition
	filter(flights, month == 12, day == 25)
	filter(flights, month == 12 & day == 25)

	# set 2: `|` keeps rows matching either condition; %in% is the shorthand
	filter(flights, month == 11 | month == 12)
	filter(flights, month %in% c(11, 12))

	# set 3 --> are these the same?
	filter(flights, !(arr_delay > 120 | dep_delay > 120))
	filter(flights, arr_delay <= 120, dep_delay <= 120)



	# slide 39 ----------------------------------------------------------------

	# 1: Import the CustomerData.csv file.

	# 2: Filter for female customers only.

	# 3: Filter for female customers that are greater than 45 years old and live in region 3.

	# 4: Filter for female customers that are greater than 45 years old or live in region 3.


	# slide 40 ----------------------------------------------------------------

	# select variables of interest
	select(flights, year, month, day)
	select(flights, year:day)


	# slide 41 ----------------------------------------------------------------

	# helper functions
	select(flights, starts_with("arr"))
	select(flights, ends_with("time"))
	select(flights, c(carrier, ends_with("time"), contains("arr")))


	# slide 42 ----------------------------------------------------------------

	# 1: Using the customer data, select all columns between CustomerID and Gender.

	# 2: Now select all columns except those between and including CustomerID and Gender.

	# 3: Select CustomerID and all variables that contain the word "Card".


	# slide 43 ----------------------------------------------------------------

	# reorder data
	arrange(flights, dep_time)
	arrange(flights, desc(dep_time))


	# slide 45 ----------------------------------------------------------------

	# 1: Select the variables CustomerID, Region, Gender, Age, HHIncome, CardSpendMonth
	# and save this as sub_cust.

	# 2: Order sub_cust data by Age and CardSpendMonth (ascending order)

	# 3: Order sub_cust data by Age (oldest to youngest) and CardSpendMonth (least to most)



	# slide 46 ----------------------------------------------------------------

	# create smaller data set so we can see the results
	flights_sml <- select(flights, ends_with("delay"), distance, air_time)
	flights_sml

	# create two new variables (gain, speed)
	mutate(flights_sml,
	gain = arr_delay - dep_delay,
	speed = distance / air_time * 60
	)

	# mutate creates variables in order so we can create
	# variables from recently created variables
	mutate(flights_sml,
	gain = arr_delay - dep_delay,
	hours = air_time / 60,
	gain_per_hour = gain / hours
	)


	# slide 47 ----------------------------------------------------------------

	# mean center data: subtract the mean so the new variable is centered on zero
	transmute(flights, center_delay = dep_delay - mean(dep_delay, na.rm = TRUE))

	# transform values
	transmute(flights,
	  log_air_time = log2(air_time),
	  exp_delay = exp(dep_delay)
	)

	# lag and cumsum values
	# note: cumsum() has no na.rm argument; if dep_delay contains a missing
	# value, sum_delay is NA from that row onward
	transmute(flights,
	  dep_delay = dep_delay,
	  lag_delay = lag(dep_delay),
	  sum_delay = cumsum(dep_delay)
	)


	# slide 48 ----------------------------------------------------------------

	# Using your customers data...

	# 1: Create a ratio variable that computes the ratio of CardSpendMonth to HHIncome

	# 2: Create two variables:
	# ratio1 = CardSpendMonth / HHIncome
	# ratio2 = CardSpendMonth / Age



	# slide 49 ----------------------------------------------------------------

	# compute single summary statistic
	summarize(flights, dep_delay_mean = mean(dep_delay, na.rm = TRUE))

	# compute multiple summary statistics
	summarize(flights,
	dep_delay_mean = mean(dep_delay, na.rm = TRUE),
	dep_delay_sd = sd(dep_delay, na.rm = TRUE),
	n = n()
	)


	# slide 51 ----------------------------------------------------------------

	# compute grouped summary statistics

	# group by month then compute average departure delay
	by_month <- group_by(flights, month)
	summarize(by_month, avg_delay = mean(dep_delay, na.rm = TRUE))

	# group by carrier then compute standard deviation of departure delay
	by_carrier <- group_by(flights, carrier)
	summarize(by_carrier, delay_sd = sd(dep_delay, na.rm = TRUE))


	# slide 53 ----------------------------------------------------------------

	# 1: In our customers data, compute the average CardSpendMonth across all customers.

	# 2: Now compute the average CardSpendMonth for each gender.

	# 3: Now compute the average CardSpendMonth for each gender and region. Which
	# gender and region have the highest average spend?



	# slide 55 ----------------------------------------------------------------

	# traditional approach: save each intermediate result, then pass it on
	by_gdr_rgn <- group_by(customers, Gender, Region)
	avg_gdr_rgn <- summarize(by_gdr_rgn, Avg_spend = mean(CardSpendMonth, na.rm = TRUE))
	arrange(avg_gdr_rgn, desc(Avg_spend))

	# pipe operator approach: the same three steps on the same customers data,
	# chained together without intermediate objects
	customers %>%
	  group_by(Gender, Region) %>%
	  summarize(Avg_spend = mean(CardSpendMonth, na.rm = TRUE)) %>%
	  arrange(desc(Avg_spend))


	# slide 56 ----------------------------------------------------------------

	# Using the pipe operator follow these steps with the customers data:

	# 1: filter for male customers only
	# 2: create a new variable: ratio = CardSpendMonth / HHIncome
	# 3: group this data by age
	# 4: compute the mean of the new ratio variable by age
	# 5: sort this output to find the age with the highest ratio of expenditures to income.



	###############################################################################
	# Visualizing Data #
	###############################################################################


	# slide 61 ----------------------------------------------------------------

	library(ggplot2)
	mpg


	# slide 62 ----------------------------------------------------------------

	# create basic canvas for plot
	ggplot(data = mpg)
	ggplot(data = mpg, aes(x = displ, y = hwy))


	# slides 64-65 ------------------------------------------------------------

	# univariate geoms
	ggplot(data = mpg, aes(x = hwy)) +
	geom_histogram()

	ggplot(data = mpg, aes(x = hwy)) +
	geom_freqpoly()

	ggplot(data = mpg, aes(x = hwy)) +
	geom_density()

	# bivariate geoms
	ggplot(data = mpg, aes(x = displ, y = hwy)) +
	geom_point()

	ggplot(data = mpg, aes(x = class, y = hwy)) +
	geom_boxplot()

	ggplot(data = mpg, aes(x = class, y = hwy)) +
	geom_violin()


	# slide 66 ----------------------------------------------------------------

	# Using the customers data:

	# 1: Create a chart that illustrates the distribution of the DebtToIncomeRatio variable.

	# 2: Create a chart that shows the counts for each JobCategory

	# 3: Create a scatter plot of HHIncome vs CardSpendMonth



	# slide 67 ----------------------------------------------------------------

	# many non-mapping aesthetics (color, size, shape, transparency)
	ggplot(data = mpg, aes(x = displ, y = hwy)) +
	geom_point(color = "blue", size = 2, shape = 17, alpha = .5)


	# slide 69 ----------------------------------------------------------------

	# non-mapping color aesthetic
	ggplot(data = mpg, aes(x = displ, y = hwy)) +
	geom_point(color = "blue")

	# mapping color aesthetic to class variable
	ggplot(data = mpg, aes(x = displ, y = hwy, color = class)) +
	geom_point()


	# slide 70 ----------------------------------------------------------------

	# 1: Create a scatter plot of HHIncome vs CardSpendMonth and color all points blue.

	# 2: Create a scatter plot of HHIncome vs CardSpendMonth and color all points based
	# on whether or not the customer is retired.



	# slide 71 ----------------------------------------------------------------

	# facet_wrap for single variable small multiples
	ggplot(data = mpg, aes(x = displ, y = hwy)) +
	geom_point() +
	facet_wrap(~ class, nrow = 2)

	# facet_grid for two variable small multiples
	ggplot(data = mpg, aes(x = displ, y = hwy)) +
	geom_point() +
	facet_grid(drv ~ cyl)


	# slide 72 ----------------------------------------------------------------

	# 1: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory.

	# 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and Gender.

	# 3: Assess UnionMember across each JobCategory.


	# slide 73 ----------------------------------------------------------------

	# adding titles with ggtitle
	ggplot(data = mpg, aes(x = displ, y = hwy)) +
	geom_jitter() +
	ggtitle("Displacement vs Highway MPG",
	subtitle = "Data from 1999 & 2008")

	# adding titles with labs
	ggplot(data = mpg, aes(x = displ, y = hwy)) +
	geom_jitter() +
	labs(
	title = "Displacement vs Highway MPG",
	subtitle = "Data from 1999 & 2008",
	caption = "http://fueleconomy.gov"
	)


	# slide 74 ----------------------------------------------------------------

	# adjusting x and y axis parameters
	ggplot(data = txhousing, aes(x = volume, y = median)) +
	geom_point(alpha = .25) +
	scale_x_log10()

	ggplot(data = txhousing, aes(x = volume, y = median)) +
	geom_point(alpha = .25) +
	scale_y_continuous(name = "Median Sales Price", labels = scales::dollar) +
	scale_x_log10(name = "Total Sales Volume", labels = scales::comma)


	# slide 75 ----------------------------------------------------------------

	# putting it all together: points, formatted axes, and titles in one plot
	# (txhousing covers 2000 through 2015, so the subtitle states that range)
	ggplot(data = txhousing, aes(x = volume, y = median)) +
	  geom_point(alpha = .15) +
	  scale_y_continuous(name = "Median Sales Price", labels = scales::dollar) +
	  scale_x_log10(name = "Total Sales Volume", labels = scales::comma) +
	  labs(
	    title = "Texas Housing Sales",
	    subtitle = "Sales data from 2000-2015 provided by the TAMU real estate center",
	    caption = "http://recenter.tamu.edu/"
	  )


	# slide 76 ----------------------------------------------------------------

	# 1: Remove all missing values from the customers data and then...
	# 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and...
	# 3: add a title, subtitle, and nicely format the axes.



	# slide 77 ----------------------------------------------------------------

	# examples of overplotting
	ggplot(data = txhousing, aes(x = volume, y = median)) +
	geom_point(alpha = .1) +
	scale_x_log10() +
	geom_smooth()

	ggplot(data = txhousing, aes(x = volume, y = median)) +
	geom_point(alpha = .1) +
	scale_x_log10() +
	geom_smooth(method = "lm")

	ggplot(data = txhousing, aes(x = volume, y = median)) +
	geom_point(alpha = .1) +
	scale_x_log10() +
	geom_smooth(method = "lm") +
	facet_wrap(~ month)


	# slide 78 ----------------------------------------------------------------

	# 1: Remove all missing values from the customers data and then...
	# 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and...
	# 3: add a title, subtitle, and nicely format the axes and...
	# 4: add a linear line to assess if the slope changes across JobCategory
No results found