Skip to content

Instantly share code, notes, and snippets.

@bradleyboehmke
Last active October 21, 2018 15:57
Show Gist options
  • Select an option

  • Save bradleyboehmke/11e62a0a750a2fa8b9924090c40b49ae to your computer and use it in GitHub Desktop.

Select an option

Save bradleyboehmke/11e62a0a750a2fa8b9924090c40b49ae to your computer and use it in GitHub Desktop.
Student script for 2018 Analytics Connect Intro to R workshop
###############################################################################
# FUNDAMENTALS #
###############################################################################
# slide 13 ----------------------------------------------------------------
# assess where the outputs from these lines of code appear
# (each line sends its result somewhere different: printed output, a help
# page, a plot, a stored object, and the command history)
# print the built-in mtcars data set
mtcars
# open the help page for sum()
?sum
# draw a histogram of the mpg column of mtcars
hist(mtcars$mpg)
# store 25 random uniform draws in an object; nothing is printed
random_numbers <- runif(25)
# show recently entered commands
history()
# slide 14 ----------------------------------------------------------------
# provides details for specific function
# (help(sqrt) and ?sqrt are two spellings of the same request)
help(sqrt)
?sqrt
# provides examples for said function
# (example() runs the Examples section of the function's help page)
example(sqrt)
# slide 15 ----------------------------------------------------------------
# get your current working directory
# (relative file paths used later in this script, such as "data/mydata.csv",
# are resolved against this directory)
getwd()
# now set your working directory
# NOTE: the path below is a placeholder; replace it with a real directory on
# your machine before running, otherwise setwd() stops with an error
setwd("enter/path/to/your/working/directory")
# slide 16 ----------------------------------------------------------------
# R follows the standard order of operations (PEMDAS): parentheses first,
# then exponents, then multiplication/division, then addition/subtraction
4 + 3 / 10^2
(4 + 3) / 10^2
(4 + 3 / 10)^2
# any calculation that involves NA returns NA
4 + 3 / 10^NA
# store the result of a calculation in an object
x <- 4 + 3 / 10^2
# slide 17 ----------------------------------------------------------------
# inputs for the exercise below
D <- 1000
K <- 5
h <- 0.25
# now compute Q based on equation
# slide 18 ----------------------------------------------------------------
# list all objects
# (returns the names of the objects in the current environment)
ls()
# remove a single object
# (D was created on slide 17; rm() warns if the object does not exist)
rm(D)
# remove all objects
# (every name that ls() returns is removed, so anything created earlier
# must be re-created before it is used again)
rm(list = ls())
# slide 20 ----------------------------------------------------------------
# install the tidyverse and nycflights13 packages
###############################################################################
# Importing Data #
###############################################################################
# slide 26 ----------------------------------------------------------------
# attach the packages that read delimited text and Excel files
library(readr)
library(readxl)
# read the .csv file; adjust the path if your copy of the file lives
# somewhere else
my_csv_data <- read_csv(file = "data/mydata.csv")
my_csv_data
# list the worksheet names contained in the workbook
excel_sheets(path = "data/mydata.xlsx")
# read a single worksheet, chosen by name
my_xlsx_data <- read_excel(
  path = "data/mydata.xlsx",
  sheet = "PICK_ME_FIRST!"
)
my_xlsx_data
# slide 28 ----------------------------------------------------------------
# 1: Read in the CustomerData.csv file and save as customers
# 2: What spreadsheets are contained in the CustomerData.xlsx file?
# 3. Read in the spreadsheet that contains the data.
# slide 30 ----------------------------------------------------------------
# Inspecting the customers data.
# NOTE: `customers` is the object you create in the slide 28 exercise; it
# must exist before these lines are run.
# dimensions (rows x columns)
dim(customers)
# get a quick glimpse of the data
# (called through dplyr:: because only readr and readxl have been attached
# at this point in the script; dplyr is not attached until slide 34)
dplyr::glimpse(customers)
# get the names of all the variables
names(customers)
# how many missing values exist (total count across all columns)
sum(is.na(customers))
# omit all observations with missing values
# (the result is saved as clean_data; customers itself is not changed)
clean_data <- na.omit(customers)
# view the data in a spreadsheet like viewer
View(customers)
###############################################################################
# Transforming Data #
###############################################################################
# slide 34 ----------------------------------------------------------------
# packages required
# (nycflights13 supplies the flights data; dplyr supplies the verbs used in
# this section: filter, select, arrange, mutate, summarize, ...)
library(nycflights13)
library(dplyr)
# data used in examples
flights
# slide 35 ----------------------------------------------------------------
# filter() keeps the rows for which every condition is TRUE; conditions
# separated by commas are combined with "and"
# filter for all observations in month 1 (January)
filter(flights, month == 1)
# filter for all observations on January 1st
filter(flights, month == 1, day == 1)
# filter for all observations on January 1st with a departure delay
filter(flights, month == 1, day == 1, dep_delay > 0)
# slide 36 ----------------------------------------------------------------
# dplyr functions do not over-write data; must save to a new data frame object
# (flights is unchanged; the filtered rows live in dec25)
dec25 <- filter(flights, month == 12, day == 25)
dec25
# slide 37 ----------------------------------------------------------------
# what will these produce?
# (operators shown: equal, not equal, set membership, less-than-or-equal,
# negation, and a missing-value test)
filter(flights, month == 12)
filter(flights, month != 12)
filter(flights, month %in% c(11, 12))
filter(flights, arr_delay <= 120)
# note: filter() drops rows where the condition evaluates to NA, so rows
# with a missing arr_delay appear in neither this result nor the one above
filter(flights, !(arr_delay <= 120))
filter(flights, is.na(tailnum))
# slide 38 ----------------------------------------------------------------
# each set below contains two ways of writing a filter; compare the results
# set 1
# (comma-separated conditions versus an explicit &)
filter(flights, month == 12, day == 25)
filter(flights, month == 12 & day == 25)
# set 2
# (two == tests joined with | versus a single %in% test)
filter(flights, month == 11 | month == 12)
filter(flights, month %in% c(11, 12))
# set 3 --> are these the same?
# (hint: apply De Morgan's law to the first line)
filter(flights, !(arr_delay > 120 | dep_delay > 120))
filter(flights, arr_delay <= 120, dep_delay <= 120)
# slide 39 ----------------------------------------------------------------
# 1: Import the CustomerData.csv file.
# 2: Filter for female customers only.
# 3: Filter for female customers that are greater than 45 years old and live in region 3.
# 4: Filter for female customers that are greater than 45 years old or live in region 3.
# slide 40 ----------------------------------------------------------------
# select variables of interest
# (columns can be listed by name, or given as a range with first:last)
select(flights, year, month, day)
select(flights, year:day)
# slide 41 ----------------------------------------------------------------
# helper functions
# (these match columns by their names: starts_with() by prefix, ends_with()
# by suffix, contains() by any substring; helpers and plain column names
# can be combined in one call)
select(flights, starts_with("arr"))
select(flights, ends_with("time"))
select(flights, c(carrier, ends_with("time"), contains("arr")))
# slide 42 ----------------------------------------------------------------
# 1: Using the customer data, select all columns between CustomerID and Gender.
# 2: Now select all columns except those between and including CustomerID and Gender.
# 3: Select CustomerID and all variables that contain the word "Card".
# slide 43 ----------------------------------------------------------------
# reorder data
# (arrange() sorts rows in ascending order by default; wrap a column in
# desc() to sort it in descending order)
arrange(flights, dep_time)
arrange(flights, desc(dep_time))
# slide 45 ----------------------------------------------------------------
# 1: Select the variables CustomerID, Region, Gender, Age, HHIncome, CardSpendMonth
# and save this as sub_cust.
# 2: Order sub_cust data by Age and CardSpendMonth (ascending order)
# 3: Order sub_cust data by Age (oldest to youngest) and CardSpendMonth (least to most)
# slide 46 ----------------------------------------------------------------
# keep only the delay columns plus distance and air_time so that the newly
# created columns are visible when the result prints
flights_sml <- select(flights, ends_with("delay"), distance, air_time)
flights_sml
# add two derived columns: gain and speed
mutate(
  flights_sml,
  gain = arr_delay - dep_delay,
  speed = distance / air_time * 60
)
# columns are created in the order written, so a later expression can use a
# column defined earlier in the same mutate() call
mutate(
  flights_sml,
  gain = arr_delay - dep_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)
# slide 47 ----------------------------------------------------------------
# transmute() computes new columns like mutate() but returns only the
# columns named in the call
# mean center data: subtract the mean so the new column averages to zero
# (dividing by the mean would rescale the values rather than center them)
transmute(
  flights,
  center_delay = dep_delay - mean(dep_delay, na.rm = TRUE)
)
# transform values
transmute(
  flights,
  log_air_time = log2(air_time),
  exp_delay = exp(dep_delay)
)
# lag and cumsum values
# (lag() shifts the column down one row; cumsum() is a running total and
# returns NA for every row from the first missing dep_delay onward)
transmute(
  flights,
  dep_delay = dep_delay,
  lag_delay = lag(dep_delay),
  sum_delay = cumsum(dep_delay)
)
# slide 48 ----------------------------------------------------------------
# Using your customers data...
# 1: Create a ratio variable that computes the ratio of CardSpendMonth to HHIncome
# 2: Create two variables:
# ratio1 = CardSpendMonth / HHIncome
# ratio2 = CardSpendMonth / Age
# slide 49 ----------------------------------------------------------------
# one summary statistic, computed over the whole data set
summarize(flights, dep_delay_mean = mean(dep_delay, na.rm = TRUE))
# several summary statistics in a single call
summarize(
  flights,
  dep_delay_mean = mean(dep_delay, na.rm = TRUE),
  dep_delay_sd = sd(dep_delay, na.rm = TRUE),
  n = n()
)
# slide 51 ----------------------------------------------------------------
# grouped summaries: group the data first, then summarize the grouped data
# average departure delay for each month
by_month <- group_by(flights, month)
summarize(by_month, avg_delay = mean(dep_delay, na.rm = TRUE))
# standard deviation of departure delay for each carrier
by_carrier <- group_by(flights, carrier)
summarize(by_carrier, delay_sd = sd(dep_delay, na.rm = TRUE))
# slide 53 ----------------------------------------------------------------
# 1: In our customers data, compute the average CardSpendMonth across all customers.
# 2: Now compute the average CardSpendMonth for each gender.
# 3: Now compute the average CardSpendMonth for each gender and region. Which
# gender and region have the highest average spend?
# slide 55 ----------------------------------------------------------------
# Two ways to compute the average CardSpendMonth for each Gender/Region
# combination, sorted from highest to lowest. Both start from the same
# data (customers) so their results can be compared directly.
# traditional approach: save each intermediate result, then pass it on
by_gdr_rgn <- group_by(customers, Gender, Region)
avg_gdr_rgn <- summarize(
  by_gdr_rgn,
  Avg_spend = mean(CardSpendMonth, na.rm = TRUE)
)
arrange(avg_gdr_rgn, desc(Avg_spend))
# pipe operator approach: %>% passes the left-hand result in as the first
# argument of the next function, so no intermediate objects are needed
customers %>%
  group_by(Gender, Region) %>%
  summarize(Avg_spend = mean(CardSpendMonth, na.rm = TRUE)) %>%
  arrange(desc(Avg_spend))
# slide 56 ----------------------------------------------------------------
# Using the pipe operator follow these steps with the customers data:
# 1: filter for male customers only
# 2: create a new variable: ratio = CardSpendMonth / HHIncome
# 3: group this data by age
# 4: compute the mean of the new ratio variable by age
# 5: sort this output to find the age with the highest ratio of expenditures to income.
###############################################################################
# Visualizing Data #
###############################################################################
# slide 61 ----------------------------------------------------------------
library(ggplot2)
# data set used in the plotting examples below
mpg
# slide 62 ----------------------------------------------------------------
# an empty canvas: data supplied, nothing mapped yet
ggplot(data = mpg)
# the same canvas with displ and hwy mapped to the x and y axes; no geom has
# been added, so nothing is drawn inside the axes
ggplot(data = mpg, mapping = aes(x = displ, y = hwy))
# slides 64-65 ------------------------------------------------------------
# one variable (hwy): three geoms for looking at its distribution
ggplot(data = mpg, mapping = aes(x = hwy)) +
  geom_histogram()
ggplot(data = mpg, mapping = aes(x = hwy)) +
  geom_freqpoly()
ggplot(data = mpg, mapping = aes(x = hwy)) +
  geom_density()
# two variables: a scatter plot, then box and violin plots of hwy by class
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point()
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_violin()
# slide 66 ----------------------------------------------------------------
# Using the customers data:
# 1: Create a chart that illustrates the distribution of the DebtToIncomeRatio variable.
# 2: Create a chart that shows the counts for each JobCategory
# 3: Create a scatter plot of HHIncome vs CardSpendMonth
# slide 67 ----------------------------------------------------------------
# aesthetics given fixed values outside aes() (color, size, shape,
# transparency) apply to every point
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(color = "blue", size = 2, shape = 17, alpha = 0.5)
# slide 69 ----------------------------------------------------------------
# fixed color: every point is drawn in blue
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(color = "blue")
# mapped color: color is set inside aes(), so it varies with the class
# variable
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = class)) +
  geom_point()
# slide 70 ----------------------------------------------------------------
# 1: Create a scatter plot of HHIncome vs CardSpendMonth and color all points blue.
# 2: Create a scatter plot of HHIncome vs CardSpendMonth and color all points based
# on whether or not the customer is retired.
# slide 71 ----------------------------------------------------------------
# small multiples by one variable: one panel per class, laid out in 2 rows
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(~ class, nrow = 2)
# small multiples by two variables: drv defines the rows, cyl the columns
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ cyl)
# slide 72 ----------------------------------------------------------------
# 1: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory.
# 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and Gender.
# 3: Assess UnionMember across each JobCategory.
# slide 73 ----------------------------------------------------------------
# title and subtitle supplied through ggtitle()
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_jitter() +
  ggtitle(
    "Displacement vs Highway MPG",
    subtitle = "Data from 1999 & 2008"
  )
# title, subtitle and caption supplied through labs()
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_jitter() +
  labs(
    title = "Displacement vs Highway MPG",
    subtitle = "Data from 1999 & 2008",
    caption = "http://fueleconomy.gov"
  )
# slide 74 ----------------------------------------------------------------
# put the x axis on a log10 scale
ggplot(data = txhousing, mapping = aes(x = volume, y = median)) +
  geom_point(alpha = 0.25) +
  scale_x_log10()
# name both axes and format their tick labels (dollars on y, commas on x)
ggplot(data = txhousing, mapping = aes(x = volume, y = median)) +
  geom_point(alpha = 0.25) +
  scale_y_continuous(name = "Median Sales Price", labels = scales::dollar) +
  scale_x_log10(name = "Total Sales Volume", labels = scales::comma)
# slide 75 ----------------------------------------------------------------
# combine the pieces: transparent points, formatted axes, and titles
ggplot(data = txhousing, mapping = aes(x = volume, y = median)) +
  geom_point(alpha = 0.15) +
  scale_y_continuous(name = "Median Sales Price", labels = scales::dollar) +
  scale_x_log10(name = "Total Sales Volume", labels = scales::comma) +
  labs(
    title = "Texas Housing Sales",
    subtitle = "Sales data from 2000-2010 provided by the TAMU real estate center",
    caption = " http://recenter.tamu.edu/"
  )
# slide 76 ----------------------------------------------------------------
# 1: Remove all missing values from the customers data and then...
# 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and...
# 3: add a title, subtitle, and nicely format the axes.
# slide 77 ----------------------------------------------------------------
# plotting one layer over another: a fitted line drawn on top of the points
# smoother with the default method
ggplot(data = txhousing, mapping = aes(x = volume, y = median)) +
  geom_point(alpha = 0.1) +
  scale_x_log10() +
  geom_smooth()
# straight-line (linear model) fit
ggplot(data = txhousing, mapping = aes(x = volume, y = median)) +
  geom_point(alpha = 0.1) +
  scale_x_log10() +
  geom_smooth(method = "lm")
# linear fit drawn separately in one panel per month
ggplot(data = txhousing, mapping = aes(x = volume, y = median)) +
  geom_point(alpha = 0.1) +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  facet_wrap(~ month)
# slide 78 ----------------------------------------------------------------
# 1: Remove all missing values from the customer data and then…
# 2: Create a scatter plot of HHIncome vs CardSpendMonth facetted by JobCategory and…
# 3: add a title, subtitle, and nicely format the axes and…
# 4: add a linear line to assess if the slope changes across JobCategory
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment