naomispence · October 27, 2025 20:45
diff --git a/complete_example_code_units1-3 b/complete_example_code_units1-3
 # Packages loaded for the course example code in this script.
 library(ggplot2)
 library(dplyr)
 library(lsr)
 library(descr)
 library(Hmisc)
 library('lehmansociology')
 # A large scipen value makes R print numbers in fixed notation rather than
 # scientific notation.
 options(scipen = 999)
 # Load the wave5addhealth dataset used by every example below
 # (presumably shipped with the lehmansociology package -- confirm).
 data(wave5addhealth)

 ########DATA MANAGEMENT#######
 # IMPORTANT: this is example code. Before you run it, work out which changes
 # YOUR variable needs, then replace VARNAME and the values/dummy codes below.

 # Step 1 (only if needed): recode values. Do this BEFORE you label your dummy
 # codes. This example changes the old dummy code 6 to a dummy code of 5, which
 # combines the two groups.
 wave5addhealth$VARNAME[which(wave5addhealth$VARNAME == 6)] <- 5

 # Step 2 (only if needed): code out missing data (R calls it NA). Replace 97
 # with the dummy code that stands for missing data on your variable.
 wave5addhealth$VARNAME[which(wave5addhealth$VARNAME == 97)] <- NA

 # Step 3: tell R to treat your categorical variable as a categorical variable
 # (factor), then label the dummy codes. Labels go in quotes and must be listed
 # in the order of the dummy codes.
 wave5addhealth$VARNAME <- factor(wave5addhealth$VARNAME)
 levels(wave5addhealth$VARNAME) <- c(
   "label first dummy code",
   "label second dummy code",
   "label as many as you have"
 )

 # Step 4: label the variable itself (like giving it a title).
 label(wave5addhealth$VARNAME) <- "Label that tells the audience what this variable says about people"

 ########FREQUENCY DISTRIBUTIONS#######
 # Change VARNAME to your variable name; change titles and x-axis labels.
 # Frequency distribution for VARNAME.
 freq(wave5addhealth$VARNAME)
 # Interpret the result of the line above.

 # You can use the line below if you have a quantitative variable that you need
 # cumulative percent for; replace the title with a description of your variable.
 frequency(wave5addhealth$VARNAME, cumulative.percent=TRUE, title="YOUR VARIABLE DESCRIPTION")
 # Interpret the result of the line above.


 ########UNIVARIATE GRAPHS#######
 # Be sure to use the appropriate graph for your variable type; change VARNAME,
 # colors, titles, and x-axis labels.

 # Bar graph: percent of non-missing cases in each category of VARNAME.
 # after_stat() is the current ggplot2 way to refer to values computed by the
 # stat (it replaces the deprecated ..count.. notation). count / sum(count) is
 # a proportion, which scale_y_continuous() then prints as a percent.
 ggplot(data = subset(wave5addhealth, !is.na(VARNAME)), aes(x = VARNAME)) +
   geom_bar(
     color = "blue", fill = "yellow",
     aes(y = after_stat(count / sum(count)))
   ) +
   scale_y_continuous(labels = scales::percent) +
   ggtitle("Bar Graph of YOUR VARIABLE DESCRIPTION, Wave 5 Add Health") +
   labs(y = "Percent", x = "LABEL FOR THE CATEGORIES") +
   theme(axis.text.x = element_text(angle = -25))
 # Interpret the result of the above.

 # Histogram: percent of non-missing cases in each bin (bins are 1 unit wide).
 # Missing values are dropped first, as in the bar graph, so ggplot2 does not
 # warn about removed rows. Here the proportion is multiplied by 100 directly.
 ggplot(data = subset(wave5addhealth, !is.na(VARNAME)), aes(x = VARNAME)) +
   geom_histogram(
     color = "blue", fill = "green", binwidth = 1,
     aes(y = after_stat(count / sum(count) * 100))
   ) +
   ggtitle("Distribution of YOUR VARIABLE DESCRIPTION, Add Health Wave 5") +
   labs(y = "Percent", x = "UNIT OF MEASUREMENT")
 # Interpret the result of the above.

 ########DESCRIPTIVE STATISTICS#######

 # Mode. MODE() is not base R; presumably it comes from one of the packages
 # loaded at the top of the script and reports the most common value.
 MODE(wave5addhealth$VARNAME)
 # Interpret the mode reported above.

 # Median. as.numeric() converts the variable to numbers first, and
 # na.rm = TRUE leaves missing values out of the calculation.
 median(as.numeric(wave5addhealth$VARNAME), na.rm = TRUE)
 # Interpret the median reported above.

 # Summary of the variable.
 summary(wave5addhealth$VARNAME, na.rm = TRUE)
 # Interpret the summary reported above.

 # Standard deviation, leaving missing values out of the calculation.
 sd(wave5addhealth$VARNAME, na.rm = TRUE)
 # Interpret the standard deviation reported above.

 ########BIVARIATE TABLE/GRAPH#######

 ##BAR GRAPH FOR QUANTITATIVE DEPENDENT VARIABLE AND CATEGORICAL INDEPENDENT VARIABLE (from chapter 7)
 # Change both variable names (y is dependent and x is independent), both axis
 # labels, and the title.
 # Rows with a missing independent variable (H5HR2) are dropped; each bar's
 # height is the mean of the dependent variable (H5ID23) within that category.
 # `fun = mean` replaces the `fun.y` argument, which ggplot2 has deprecated.

 ggplot(data = subset(wave5addhealth, !is.na(H5HR2))) +
   stat_summary(aes(x = H5HR2, y = H5ID23), fun = mean, geom = "bar") +
   ylab("Average Hours Per Week") +
   xlab("Current Living Arrangements") +
   ggtitle("Bar Graph of Average Time Spent Watching TV/Movies/Videos by Living Arrangements")
 # Interpretation: The graph shows that Add Health respondents who live in their own
 # home watch about 13 hours of TV, movies, and videos per week. The highest average
 # time spent watching TV is among those living in their parents' home or another
 # person's home; these groups average about 17.5 hours per week.


 ##BIVARIATE TABLE (CROSSTAB) FOR 2 CATEGORICAL VARIABLES (not in chapter 7)
 # Put your DEPENDENT variable first: dependent ~ independent.
 # The order in which you list the variables is critical for interpreting the
 # results correctly. We "percent down, compare across" to see how the
 # dependent variable differs between groups of the independent variable.

 lehmansociology::crosstab(
   H5HR2 ~ H5OD2A,
   data = wave5addhealth,
   title = "Living Arrangements by Sex Assigned at Birth",
   format = "column_percent"
 )
 # Interpretation: 85% of males live in their own place, compared to 89% of
 # females. A higher percent of those assigned the male sex at birth (9.1%)
 # live with their parents as adults in their 30s or early 40s, compared to
 # 6.4% of females.
	# Packages loaded for the course example code in this script.
	library(ggplot2)
	library(dplyr)
	library(lsr)
	library(descr)
	library(Hmisc)
	library('lehmansociology')
	# A large scipen value makes R print numbers in fixed notation rather than
	# scientific notation.
	options(scipen = 999)
	# Load the wave5addhealth dataset used by every example below
	# (presumably shipped with the lehmansociology package -- confirm).
	data(wave5addhealth)

	########DATA MANAGEMENT#######
	# IMPORTANT: this is example code. Before you run it, work out which changes
	# YOUR variable needs, then replace VARNAME and the values/dummy codes below.

	# Step 1 (only if needed): recode values. Do this BEFORE you label your dummy
	# codes. This example changes the old dummy code 6 to a dummy code of 5, which
	# combines the two groups.
	wave5addhealth$VARNAME[which(wave5addhealth$VARNAME == 6)] <- 5

	# Step 2 (only if needed): code out missing data (R calls it NA). Replace 97
	# with the dummy code that stands for missing data on your variable.
	wave5addhealth$VARNAME[which(wave5addhealth$VARNAME == 97)] <- NA

	# Step 3: tell R to treat your categorical variable as a categorical variable
	# (factor), then label the dummy codes. Labels go in quotes and must be listed
	# in the order of the dummy codes.
	wave5addhealth$VARNAME <- factor(wave5addhealth$VARNAME)
	levels(wave5addhealth$VARNAME) <- c(
	  "label first dummy code",
	  "label second dummy code",
	  "label as many as you have"
	)

	# Step 4: label the variable itself (like giving it a title).
	label(wave5addhealth$VARNAME) <- "Label that tells the audience what this variable says about people"

	########FREQUENCY DISTRIBUTIONS#######
	# Change VARNAME to your variable name; change titles and x-axis labels.
	# Frequency distribution for VARNAME.
	freq(wave5addhealth$VARNAME)
	# Interpret the result of the line above.

	# You can use the line below if you have a quantitative variable that you need
	# cumulative percent for; replace the title with a description of your variable.
	frequency(wave5addhealth$VARNAME, cumulative.percent=TRUE, title="YOUR VARIABLE DESCRIPTION")
	# Interpret the result of the line above.


	########UNIVARIATE GRAPHS#######
	# Be sure to use the appropriate graph for your variable type; change VARNAME,
	# colors, titles, and x-axis labels.

	# Bar graph: percent of non-missing cases in each category of VARNAME.
	# after_stat() is the current ggplot2 way to refer to values computed by the
	# stat (it replaces the deprecated ..count.. notation). count / sum(count) is
	# a proportion, which scale_y_continuous() then prints as a percent.
	ggplot(data = subset(wave5addhealth, !is.na(VARNAME)), aes(x = VARNAME)) +
	  geom_bar(
	    color = "blue", fill = "yellow",
	    aes(y = after_stat(count / sum(count)))
	  ) +
	  scale_y_continuous(labels = scales::percent) +
	  ggtitle("Bar Graph of YOUR VARIABLE DESCRIPTION, Wave 5 Add Health") +
	  labs(y = "Percent", x = "LABEL FOR THE CATEGORIES") +
	  theme(axis.text.x = element_text(angle = -25))
	# Interpret the result of the above.

	# Histogram: percent of non-missing cases in each bin (bins are 1 unit wide).
	# Missing values are dropped first, as in the bar graph, so ggplot2 does not
	# warn about removed rows. Here the proportion is multiplied by 100 directly.
	ggplot(data = subset(wave5addhealth, !is.na(VARNAME)), aes(x = VARNAME)) +
	  geom_histogram(
	    color = "blue", fill = "green", binwidth = 1,
	    aes(y = after_stat(count / sum(count) * 100))
	  ) +
	  ggtitle("Distribution of YOUR VARIABLE DESCRIPTION, Add Health Wave 5") +
	  labs(y = "Percent", x = "UNIT OF MEASUREMENT")
	# Interpret the result of the above.

	########DESCRIPTIVE STATISTICS#######

	# Mode. MODE() is not base R; presumably it comes from one of the packages
	# loaded at the top of the script and reports the most common value.
	MODE(wave5addhealth$VARNAME)
	# Interpret the mode reported above.

	# Median. as.numeric() converts the variable to numbers first, and
	# na.rm = TRUE leaves missing values out of the calculation.
	median(as.numeric(wave5addhealth$VARNAME), na.rm = TRUE)
	# Interpret the median reported above.

	# Summary of the variable.
	summary(wave5addhealth$VARNAME, na.rm = TRUE)
	# Interpret the summary reported above.

	# Standard deviation, leaving missing values out of the calculation.
	sd(wave5addhealth$VARNAME, na.rm = TRUE)
	# Interpret the standard deviation reported above.

	########BIVARIATE TABLE/GRAPH#######

	##BAR GRAPH FOR QUANTITATIVE DEPENDENT VARIABLE AND CATEGORICAL INDEPENDENT VARIABLE (from chapter 7)
	# Change both variable names (y is dependent and x is independent), both axis
	# labels, and the title.
	# Rows with a missing independent variable (H5HR2) are dropped; each bar's
	# height is the mean of the dependent variable (H5ID23) within that category.
	# `fun = mean` replaces the `fun.y` argument, which ggplot2 has deprecated.

	ggplot(data = subset(wave5addhealth, !is.na(H5HR2))) +
	  stat_summary(aes(x = H5HR2, y = H5ID23), fun = mean, geom = "bar") +
	  ylab("Average Hours Per Week") +
	  xlab("Current Living Arrangements") +
	  ggtitle("Bar Graph of Average Time Spent Watching TV/Movies/Videos by Living Arrangements")
	# Interpretation: The graph shows that Add Health respondents who live in their own
	# home watch about 13 hours of TV, movies, and videos per week. The highest average
	# time spent watching TV is among those living in their parents' home or another
	# person's home; these groups average about 17.5 hours per week.


	##BIVARIATE TABLE (CROSSTAB) FOR 2 CATEGORICAL VARIABLES (not in chapter 7)
	# Put your DEPENDENT variable first: dependent ~ independent.
	# The order in which you list the variables is critical for interpreting the
	# results correctly. We "percent down, compare across" to see how the
	# dependent variable differs between groups of the independent variable.

	lehmansociology::crosstab(
	  H5HR2 ~ H5OD2A,
	  data = wave5addhealth,
	  title = "Living Arrangements by Sex Assigned at Birth",
	  format = "column_percent"
	)
	# Interpretation: 85% of males live in their own place, compared to 89% of
	# females. A higher percent of those assigned the male sex at birth (9.1%)
	# live with their parents as adults in their 30s or early 40s, compared to
	# 6.4% of females.
No results found