Last active
October 21, 2018 15:57
-
-
Save bradleyboehmke/11e62a0a750a2fa8b9924090c40b49ae to your computer and use it in GitHub Desktop.
Student script for 2018 Analytics Connect Intro to R workshop
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ############################################################################### | |
| # FUNDAMENTALS # | |
| ############################################################################### | |
| # slide 13 ---------------------------------------------------------------- | |
| # assess where the outputs from these lines of code appear | |
| # (a printed object, a help page, a plot, a silent assignment, the command history) | |
| mtcars | |
| ?sum | |
| hist(mtcars$mpg) | |
| random_numbers <- runif(25) | |
| history() | |
| # slide 14 ---------------------------------------------------------------- | |
| # provides details for specific function | |
| help(sqrt) | |
| ?sqrt | |
| # provides examples for said function | |
| example(sqrt) | |
| # slide 15 ---------------------------------------------------------------- | |
| # get your current working directory | |
| getwd() | |
| # now set your working directory | |
| # (replace the placeholder path below with a real directory before running) | |
| setwd("enter/path/to/your/working/directory") | |
| # slide 16 ---------------------------------------------------------------- | |
| # uses PEMDAS convention for order of operations | |
| # (exponent first, then division, then addition, unless parentheses say otherwise) | |
| 4 + 3 / 10 ^ 2 | |
| (4 + 3) / 10 ^ 2 | |
| (4 + 3 / 10) ^ 2 | |
| # calculations with NA produce NA | |
| 4 + 3 / 10 ^ NA | |
| # assign results to an object | |
| x <- 4 + 3 / 10 ^ 2 | |
| # slide 17 ---------------------------------------------------------------- | |
| # inputs for the exercise below | |
| D <- 1000 | |
| K <- 5 | |
| h <- .25 | |
| # now compute Q based on equation | |
| # (the equation itself is on the slide; it is not reproduced in this script) | |
| # slide 18 ---------------------------------------------------------------- | |
| # list all objects | |
| ls() | |
| # remove a single object | |
| rm(D) | |
| # remove all objects | |
| # (this also deletes x, K, h and random_numbers created above) | |
| rm(list = ls()) | |
| # slide 20 ---------------------------------------------------------------- | |
| # install the tidyverse and nycflights13 packages | |
| ############################################################################### | |
| # Importing Data # | |
| ############################################################################### | |
| # slide 26 ---------------------------------------------------------------- | |
| # load readr and readxl packages | |
| library(readr) | |
| library(readxl) | |
| # import .csv data --> may need to adjust path to where you saved the csv file | |
| # (a relative path such as this one is resolved against the current working directory) | |
| my_csv_data <- read_csv("data/mydata.csv") | |
| my_csv_data | |
| # read .xlsx sheet names | |
| excel_sheets("data/mydata.xlsx") | |
| # load specified sheet | |
| # (the sheet argument names the worksheet to read) | |
| my_xlsx_data <- read_excel("data/mydata.xlsx", sheet = "PICK_ME_FIRST!") | |
| my_xlsx_data | |
| # slide 28 ---------------------------------------------------------------- | |
| # 1: Read in the CustomerData.csv file and save as customers | |
| # (later slides in this script use the customers object created here) | |
| # 2: What spreadsheets are contained in the CustomerData.xlsx file? | |
| # 3: Read in the spreadsheet that contains the data. | |
| # dimensions (rows x columns) | |
| dim(customers) | |
| # get a quick glimpse of the data | |
| glimpse(customers) | |
| # get the names of all the variables | |
| names(customers) | |
| # how many missing values exist | |
| sum(is.na(customers)) | |
| # omit all observations with missing values | |
| clean_data <- na.omit(customers) | |
| # view the data in a spreadsheet like viewer | |
| View(customers) | |
| ############################################################################### | |
| # Transforming Data # | |
| ############################################################################### | |
| # slide 34 ---------------------------------------------------------------- | |
| # packages required | |
| library(nycflights13) | |
| library(dplyr) | |
| # data used in examples | |
| flights | |
| # slide 35 ---------------------------------------------------------------- | |
| # filter for all observations in month 1 (January) | |
| filter(flights, month == 1) | |
| # filter for all observations on January 1st | |
| # (comma-separated conditions must all be true for a row to be kept) | |
| filter(flights, month == 1, day == 1) | |
| # filter for all observations on January 1st with a departure delay | |
| filter(flights, month == 1, day == 1, dep_delay > 0) | |
| # slide 36 ---------------------------------------------------------------- | |
| # dplyr functions do not over-write data; must save to a new data frame object | |
| dec25 <- filter(flights, month == 12, day == 25) | |
| dec25 | |
| # slide 37 ---------------------------------------------------------------- | |
| # what will these produce? | |
| # (filter() keeps only rows where the condition is TRUE, so rows where the | |
| # condition evaluates to NA are dropped) | |
| filter(flights, month == 12) | |
| filter(flights, month != 12) | |
| filter(flights, month %in% c(11, 12)) | |
| filter(flights, arr_delay <= 120) | |
| filter(flights, !(arr_delay <= 120)) | |
| filter(flights, is.na(tailnum)) | |
| # slide 38 ---------------------------------------------------------------- | |
| # each set pairs two ways of writing a condition so the results can be compared | |
| # set 1 | |
| filter(flights, month == 12, day == 25) | |
| filter(flights, month == 12 & day == 25) | |
| # set 2 | |
| filter(flights, month == 11 | month == 12) | |
| filter(flights, month %in% c(11, 12)) | |
| # set 3 --> are these the same? | |
| filter(flights, !(arr_delay > 120 | dep_delay > 120)) | |
| filter(flights, arr_delay <= 120, dep_delay <= 120) | |
| # slide 39 ---------------------------------------------------------------- | |
| # 1: Import the CustomerData.csv file. | |
| # 2: Filter for female customers only. | |
| # 3: Filter for female customers that are greater than 45 years old and live in region 3. | |
| # 4: Filter for female customers that are greater than 45 years old or live in region 3. | |
| # slide 40 ---------------------------------------------------------------- | |
| # select variables of interest | |
| # (year:day selects every column from year through day, inclusive) | |
| select(flights, year, month, day) | |
| select(flights, year:day) | |
| # slide 41 ---------------------------------------------------------------- | |
| # helper functions | |
| # (match columns by name prefix, suffix, or substring) | |
| select(flights, starts_with("arr")) | |
| select(flights, ends_with("time")) | |
| select(flights, c(carrier, ends_with("time"), contains("arr"))) | |
| # slide 42 ---------------------------------------------------------------- | |
| # 1: Using the customer data, select all columns between CustomerID and Gender. | |
| # 2: Now select all columns except those between and including CustomerID and Gender. | |
| # 3: Select CustomerID and all variables that contain the word "Card". | |
| # slide 43 ---------------------------------------------------------------- | |
| # reorder data | |
| # (ascending by default; desc() reverses the order; missing values are sorted last either way) | |
| arrange(flights, dep_time) | |
| arrange(flights, desc(dep_time)) | |
| # slide 45 ---------------------------------------------------------------- | |
| # 1: Select the variables CustomerID, Region, Gender, Age, HHIncome, CardSpendMonth | |
| # and save this as sub_cust. | |
| # 2: Order sub_cust data by Age and CardSpendMonth (ascending order) | |
| # 3: Order sub_cust data by Age (oldest to youngest) and CardSpendMonth (least to most) | |
| # slide 46 ---------------------------------------------------------------- | |
| # create smaller data set so we can see the results | |
| flights_sml <- select(flights, ends_with("delay"), distance, air_time) | |
| flights_sml | |
| # create two new variables (gain, speed) | |
| # (the result is printed, not saved; flights_sml itself is unchanged) | |
| mutate(flights_sml, | |
| gain = arr_delay - dep_delay, | |
| speed = distance / air_time * 60 | |
| ) | |
| # mutate creates variables in order so we can create | |
| # variables from recently created variables | |
| mutate(flights_sml, | |
| gain = arr_delay - dep_delay, | |
| hours = air_time / 60, | |
| gain_per_hour = gain / hours | |
| ) | |
| # slide 47 ---------------------------------------------------------------- | |
| # transmute() works like mutate() but keeps only the newly created variables | |
| # mean center data | |
| # (subtract the mean so the centered values average to zero; na.rm = TRUE | |
| # ignores missing delays when computing the mean) | |
| transmute(flights, center_delay = dep_delay - mean(dep_delay, na.rm = TRUE)) | |
| # transform values | |
| transmute(flights, | |
| log_air_time = log2(air_time), | |
| exp_delay = exp(dep_delay) | |
| ) | |
| # lag and cumsum values | |
| # (lag() shifts values down one row, so the first lag_delay is NA; cumsum() | |
| # returns NA from the first missing dep_delay onward) | |
| transmute(flights, | |
| dep_delay = dep_delay, | |
| lag_delay = lag(dep_delay), | |
| sum_delay = cumsum(dep_delay) | |
| ) | |
| # slide 48 ---------------------------------------------------------------- | |
| # Using your customers data... | |
| # 1: Create a ratio variable that computes the ratio of CardSpendMonth to HHIncome | |
| # 2: Create two variables: | |
| # ratio1 = CardSpendMonth / HHIncome | |
| # ratio2 = CardSpendMonth / Age | |
| # slide 49 ---------------------------------------------------------------- | |
| # compute single summary statistic | |
| # (na.rm = TRUE drops missing delays; without it the mean would be NA) | |
| summarize(flights, dep_delay_mean = mean(dep_delay, na.rm = TRUE)) | |
| # compute multiple summary statistics | |
| # (n() counts the rows being summarized) | |
| summarize(flights, | |
| dep_delay_mean = mean(dep_delay, na.rm = TRUE), | |
| dep_delay_sd = sd(dep_delay, na.rm = TRUE), | |
| n = n() | |
| ) | |
| # slide 51 ---------------------------------------------------------------- | |
| # compute grouped summary statistics | |
| # (summarize() on grouped data returns one row per group) | |
| # group by month then compute average departure delay | |
| by_month <- group_by(flights, month) | |
| summarize(by_month, avg_delay = mean(dep_delay, na.rm = TRUE)) | |
| # group by carrier then compute standard deviation of departure delay | |
| by_carrier <- group_by(flights, carrier) | |
| summarize(by_carrier, delay_sd = sd(dep_delay, na.rm = TRUE)) | |
| # slide 53 ---------------------------------------------------------------- | |
| # 1: In our customers data, compute the average CardSpendMonth across all customers. | |
| # 2: Now compute the average CardSpendMonth for each gender. | |
| # 3: Now compute the average CardSpendMonth for each gender and region. Which | |
| # gender and region have the highest average spend? | |
| # slide 55 ---------------------------------------------------------------- | |
| # both approaches compute the average CardSpendMonth for each Gender/Region | |
| # combination in the customers data and sort from highest to lowest | |
| # traditional approach | |
| # (each step is saved to an intermediate object and passed to the next call) | |
| by_gdr_rgn <- group_by(customers, Gender, Region) | |
| avg_gdr_rgn <- summarize(by_gdr_rgn, Avg_spend = mean(CardSpendMonth, na.rm = TRUE)) | |
| arrange(avg_gdr_rgn, desc(Avg_spend)) | |
| # pipe operator approach | |
| # (same steps on the same customers data; %>% passes the result of each line | |
| # as the first argument of the next, so no intermediate objects are needed) | |
| customers %>% | |
| group_by(Gender, Region) %>% | |
| summarize(Avg_spend = mean(CardSpendMonth, na.rm = TRUE)) %>% | |
| arrange(desc(Avg_spend)) | |
| # slide 56 ---------------------------------------------------------------- | |
| # Using the pipe operator follow these steps with the customers data: | |
| # 1: filter for male customers only | |
| # 2: create a new variable: ratio = CardSpendMonth / HHIncome | |
| # 3: group this data by age | |
| # 4: compute the mean of the new ratio variable by age | |
| # 5: sort this output to find the age with the highest ratio of expenditures to income. | |
| ############################################################################### | |
| # Visualizing Data # | |
| ############################################################################### | |
| # slide 61 ---------------------------------------------------------------- | |
| # mpg is an example data set that ships with ggplot2 | |
| library(ggplot2) | |
| mpg | |
| # slide 62 ---------------------------------------------------------------- | |
| # create basic canvas for plot | |
| # (no geom layer is added yet, so no data is drawn; the second call adds the x and y mappings) | |
| ggplot(data = mpg) | |
| ggplot(data = mpg, aes(x = displ, y = hwy)) | |
| # slides 64-65 ------------------------------------------------------------ | |
| # univariate geoms | |
| # (only x is mapped; the geom computes the y values) | |
| ggplot(data = mpg, aes(x = hwy)) + | |
| geom_histogram() | |
| ggplot(data = mpg, aes(x = hwy)) + | |
| geom_freqpoly() | |
| ggplot(data = mpg, aes(x = hwy)) + | |
| geom_density() | |
| # bivariate geoms | |
| ggplot(data = mpg, aes(x = displ, y = hwy)) + | |
| geom_point() | |
| ggplot(data = mpg, aes(x = class, y = hwy)) + | |
| geom_boxplot() | |
| ggplot(data = mpg, aes(x = class, y = hwy)) + | |
| geom_violin() | |
| # slide 66 ---------------------------------------------------------------- | |
| # Using the customers data: | |
| # 1: Create a chart that illustrates the distribution of the DebtToIncomeRatio variable. | |
| # 2: Create a chart that shows the counts for each JobCategory | |
| # 3: Create a scatter plot of HHIncome vs CardSpendMonth | |
| # slide 67 ---------------------------------------------------------------- | |
| # many non-mapping aesthetics (color, size, shape, transparency) | |
| # (set outside aes(), so the same fixed value applies to every point) | |
| ggplot(data = mpg, aes(x = displ, y = hwy)) + | |
| geom_point(color = "blue", size = 2, shape = 17, alpha = .5) | |
| # slide 69 ---------------------------------------------------------------- | |
| # non-mapping color aesthetic | |
| ggplot(data = mpg, aes(x = displ, y = hwy)) + | |
| geom_point(color = "blue") | |
| # mapping color aesthetic to class variable | |
| # (set inside aes(), so the color varies with the value of class) | |
| ggplot(data = mpg, aes(x = displ, y = hwy, color = class)) + | |
| geom_point() | |
| # slide 70 ---------------------------------------------------------------- | |
| # 1: Create a scatter plot of HHIncome vs CardSpendMonth and color all points blue. | |
| # 2: Create a scatter plot of HHIncome vs CardSpendMonth and color all points based | |
| # on whether or not the customer is retired. | |
| # slide 71 ---------------------------------------------------------------- | |
| # facet_wrap for single variable small multiples | |
| ggplot(data = mpg, aes(x = displ, y = hwy)) + | |
| geom_point() + | |
| facet_wrap(~ class, nrow = 2) | |
| # facet_grid for two variable small multiples | |
| # (drv defines the rows and cyl the columns of the grid) | |
| ggplot(data = mpg, aes(x = displ, y = hwy)) + | |
| geom_point() + | |
| facet_grid(drv ~ cyl) | |
| # slide 72 ---------------------------------------------------------------- | |
| # 1: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory. | |
| # 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and Gender. | |
| # 3: Assess UnionMember across each JobCategory. | |
| # slide 73 ---------------------------------------------------------------- | |
| # adding titles with ggtitle | |
| ggplot(data = mpg, aes(x = displ, y = hwy)) + | |
| geom_jitter() + | |
| ggtitle("Displacement vs Highway MPG", | |
| subtitle = "Data from 1999 & 2008") | |
| # adding titles with labs | |
| # (labs() sets the same title and subtitle and also adds a caption) | |
| ggplot(data = mpg, aes(x = displ, y = hwy)) + | |
| geom_jitter() + | |
| labs( | |
| title = "Displacement vs Highway MPG", | |
| subtitle = "Data from 1999 & 2008", | |
| caption = "http://fueleconomy.gov" | |
| ) | |
| # slide 74 ---------------------------------------------------------------- | |
| # adjusting x and y axis parameters | |
| # (txhousing is an example data set that ships with ggplot2; scales:: is called | |
| # by namespace, so the scales package must be installed but need not be attached) | |
| ggplot(data = txhousing, aes(x = volume, y = median)) + | |
| geom_point(alpha = .25) + | |
| scale_x_log10() | |
| ggplot(data = txhousing, aes(x = volume, y = median)) + | |
| geom_point(alpha = .25) + | |
| scale_y_continuous(name = "Median Sales Price", labels = scales::dollar) + | |
| scale_x_log10(name = "Total Sales Volume", labels = scales::comma) | |
| # slide 75 ---------------------------------------------------------------- | |
| # putting it all together | |
| ggplot(data = txhousing, aes(x = volume, y = median)) + | |
| geom_point(alpha = .15) + | |
| scale_y_continuous(name = "Median Sales Price", labels = scales::dollar) + | |
| scale_x_log10(name = "Total Sales Volume", labels = scales::comma) + | |
| labs( | |
| title = "Texas Housing Sales", | |
| subtitle = "Sales data from 2000-2010 provided by the TAMU real estate center", | |
| caption = " http://recenter.tamu.edu/" | |
| ) | |
| # slide 76 ---------------------------------------------------------------- | |
| # 1: Remove all missing values from the customers data and then... | |
| # 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and... | |
| # 3: add a title, subtitle, and nicely format the axes. | |
| # slide 77 ---------------------------------------------------------------- | |
| # examples of layering a smoothed trend line over the points | |
| # (the low alpha keeps heavily overplotted points readable) | |
| ggplot(data = txhousing, aes(x = volume, y = median)) + | |
| geom_point(alpha = .1) + | |
| scale_x_log10() + | |
| geom_smooth() | |
| # method = "lm" draws a straight-line (linear model) fit instead of the default smoother | |
| ggplot(data = txhousing, aes(x = volume, y = median)) + | |
| geom_point(alpha = .1) + | |
| scale_x_log10() + | |
| geom_smooth(method = "lm") | |
| # faceting by month fits and draws a separate line in each panel | |
| ggplot(data = txhousing, aes(x = volume, y = median)) + | |
| geom_point(alpha = .1) + | |
| scale_x_log10() + | |
| geom_smooth(method = "lm") + | |
| facet_wrap(~ month) | |
| # slide 78 ---------------------------------------------------------------- | |
| # 1: Remove all missing values from the customers data and then... | |
| # 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and... | |
| # 3: add a title, subtitle, and nicely format the axes and... | |
| # 4: add a linear line to assess if the slope changes across JobCategory | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment