Skip to content

Instantly share code, notes, and snippets.

@naomispence
Created October 27, 2025 20:45
Show Gist options
  • Select an option

  • Save naomispence/1d47016d4e16502a90cf33f21c277e6f to your computer and use it in GitHub Desktop.

Select an option

Save naomispence/1d47016d4e16502a90cf33f21c277e6f to your computer and use it in GitHub Desktop.
######## SETUP ########
# Attach the packages used by this script. The order is kept as-is on purpose:
# a package attached later can mask same-named functions from an earlier one.
library(ggplot2)
library(dplyr)
library(lsr)
library(descr)
library(Hmisc)
library(lehmansociology)

# A very large scipen value makes R prefer fixed notation (e.g. 0.00001)
# over scientific notation (e.g. 1e-05) when printing numbers.
options(scipen = 999)

# Load the wave5addhealth data set that every section below works on.
data(wave5addhealth)
######## DATA MANAGEMENT ########
# IMPORTANT: this is example code. Before you run it, work out which changes
# YOUR variable actually needs, then replace VARNAME and the values / dummy
# codes shown here with your own.
#
# Step 1 (only if needed): recode values. Do all recoding BEFORE you label the
# dummy codes in step 3. This example turns the old dummy code 6 into 5, which
# combines those two groups into one.
wave5addhealth$VARNAME[wave5addhealth$VARNAME %in% 6] <- 5

# Step 2 (only if needed): code out missing data (R calls it NA). Replace 97
# with the dummy code that means "missing" on your variable.
wave5addhealth$VARNAME[wave5addhealth$VARNAME %in% 97] <- NA

# Step 3: tell R to treat the variable as categorical (a factor), then label
# the dummy codes. The labels go in quotes and MUST be listed in the same
# order as the dummy codes.
wave5addhealth$VARNAME <- factor(wave5addhealth$VARNAME)
levels(wave5addhealth$VARNAME) <- c(
  "label first dummy code",
  "label second dummy code",
  "label as many as you have"
)

# Step 4: give the variable itself a label (like giving it a title).
label(wave5addhealth$VARNAME) <-
  "Label that tells the audience what this variable says about people"
######## FREQUENCY DISTRIBUTIONS ########
# Change VARNAME to your variable name, and change the title to describe
# your variable.
# Frequency distribution of the variable.
# NOTE(review): freq() is not defined in this script; presumably it comes from
# the descr package attached at the top -- confirm.
freq(wave5addhealth$VARNAME)
# INTERPRET THE RESULT OF THE LINE ABOVE
# Use the call below instead if you have a quantitative variable and need the
# cumulative percent column (requested with cumulative.percent=TRUE).
# NOTE(review): this frequency() is presumably the lehmansociology version, not
# the base R time-series function of the same name -- confirm.
frequency(wave5addhealth$VARNAME, cumulative.percent=TRUE, title="YOUR VARIABLE DESCRIPTION")
# INTERPRET THE RESULT OF THE LINE ABOVE
######## UNIVARIATE GRAPHS ########
# Use the graph that matches your variable type, then change VARNAME, the
# colors, the titles, and the axis labels.

# Categorical variable: bar graph showing the percent of cases per category.
# Rows where VARNAME is NA are dropped first so that NA is neither drawn as
# its own bar nor counted in the percentages.
ggplot(data = subset(wave5addhealth, !is.na(VARNAME)), aes(x = VARNAME)) +
  # after_stat(count) is the per-bar count computed by the bar stat; dividing
  # by the total converts counts to proportions. It replaces the deprecated
  # ..count.. notation.
  geom_bar(color = "blue", fill = "yellow",
           aes(y = after_stat(count / sum(count)))) +
  # Show the proportions on the y-axis as percentages.
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Bar Graph of YOUR VARIABLE DESCRIPTION, Wave 5 Add Health") +
  labs(y = "Percent", x = "LABEL FOR THE CATEGORIES") +
  # Tilt the category labels so that long labels do not overlap.
  theme(axis.text.x = element_text(angle = -25))
# INTERPRET THE RESULT OF THE ABOVE

# Quantitative variable: histogram with one bar per unit (binwidth = 1) and
# the y-axis expressed as a percent of all non-missing cases.
# NA rows are dropped up front, as in the bar graph above, so ggplot does not
# warn about removed rows; the bars themselves are unchanged.
ggplot(data = subset(wave5addhealth, !is.na(VARNAME)), aes(x = VARNAME)) +
  geom_histogram(color = "blue", fill = "green", binwidth = 1,
                 aes(y = after_stat(count / sum(count) * 100))) +
  ggtitle("Distribution of YOUR VARIABLE DESCRIPTION, Add Health Wave 5") +
  labs(y = "Percent", x = "UNIT OF MEASUREMENT")
# INTERPRET THE RESULT OF THE ABOVE
######## DESCRIPTIVE STATISTICS ########
# Change VARNAME to your variable name in each call below.
# Mode of the variable.
# NOTE(review): MODE() is not defined in this script; presumably it comes from
# one of the attached packages (likely lehmansociology) -- confirm.
MODE(wave5addhealth$VARNAME)
# INTERPRET THE RESULT OF THE LINE ABOVE
# Median, with NA values removed. as.numeric() converts the variable to numbers
# first.
# NOTE(review): if VARNAME has been turned into a factor, as.numeric() returns
# the position of each level, not the level's text -- confirm that the result
# maps back to the category you expect.
median(as.numeric(wave5addhealth$VARNAME), na.rm=TRUE)
# INTERPRET THE RESULT OF THE LINE ABOVE
# Summary of the variable.
# NOTE(review): summary() does not document an na.rm argument for plain
# vectors, so na.rm=TRUE is presumably ignored here -- confirm.
summary(wave5addhealth$VARNAME, na.rm=TRUE)
# INTERPRET THE RESULT OF THE LINE ABOVE
# Standard deviation, with NA values removed.
# NOTE(review): unlike the median call above, VARNAME is not converted with
# as.numeric() here, so this presumably expects a numeric variable -- confirm.
sd(wave5addhealth$VARNAME, na.rm=TRUE)
# INTERPRET THE RESULT OF THE LINE ABOVE
######## BIVARIATE TABLE/GRAPH ########
## BAR GRAPH FOR QUANTITATIVE DEPENDENT VARIABLE AND CATEGORICAL INDEPENDENT
## VARIABLE (from chapter 7)
# Change both variable names (y is dependent and x is independent), both axis
# labels, and the title.
# Rows where the independent variable (H5HR2) is NA are dropped so that NA is
# not drawn as its own bar.
ggplot(data = subset(wave5addhealth, !is.na(H5HR2))) +
  # One bar per category of x, whose height is the mean of y in that category.
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(aes(x = H5HR2, y = H5ID23), fun = mean, geom = "bar") +
  ylab("Average Hours Per Week") +
  xlab("Current Living Arrangements") +
  ggtitle("Bar Graph of Average Time Spent Watching TV/Movies/Videos by Living Arrangements")
# Interpretation: The graph shows that Add Health respondents who live in their
# own home watch about 13 hours of TV, movies, and videos per week. The highest
# average time spent watching TV is among those living in their parents' home
# or another person's home; these groups average about 17.5 hours per week.
## BIVARIATE TABLE (CROSSTAB) FOR 2 CATEGORICAL VARIABLES (not in chapter 7)
# NOTE: put your DEPENDENT variable first; the formula is dependent ~ independent.
# The order in which you list the variables in a crosstab is critical for
# interpreting the results correctly. We "percent down, compare across" to see
# how the dependent variable differs between groups of the independent variable.
# Change both variable names and the title to match your own variables.
lehmansociology::crosstab(H5HR2 ~ H5OD2A, data = wave5addhealth,
title = "Living Arrangements by Sex Assigned at Birth",
format= "column_percent")
# Interpretation: 85% of males live in their own place, compared to 89% of
# females. A higher percent of those who were assigned the male sex at birth
# (9.1%) live with their parents as adults in their 30s or early 40s, compared
# to 6.4% of females.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment