Created
October 27, 2025 20:45
-
-
Save naomispence/1d47016d4e16502a90cf33f21c277e6f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(ggplot2) | |
| library(dplyr) | |
| library(lsr) | |
| library(descr) | |
| library(Hmisc) | |
| library('lehmansociology') | |
| options(scipen = 999) | |
| data(wave5addhealth) | |
| ########DATA MANAGEMENT####### | |
| #IMPORTANT: before using this data management example code, work out which changes YOUR variable needs. | |
| #Replace VARNAME with your variable's name, and change the values/dummy codes to match your variable. | |
| #If you need to recode any of your variable's values, do it before you label the dummy codes. | |
| wave5addhealth$VARNAME[wave5addhealth$VARNAME == 6] <- 5 | |
| #The line right above this note replaces the old dummy code 6 with dummy code 5, which combines the two groups. | |
| #If you need to code out missing data (R calls it NA), use the example shown in the line below. | |
| wave5addhealth$VARNAME[wave5addhealth$VARNAME == 97] <- NA | |
| #In the line above, the number after == (inside the square brackets) must be the dummy code for missing data on your variable. | |
| #The two lines below tell R to treat your variable as categorical (a factor) and then label its dummy codes. | |
| wave5addhealth$VARNAME <- factor(wave5addhealth$VARNAME) | |
| levels(wave5addhealth$VARNAME) <- c("label first dummy code", "label second dummy code", "label as many as you have") | |
| #On the line above this note, your labels go in quotes, one per dummy code, in the order of the dummy codes (lowest code first). | |
| #The line below labels the variable itself (like giving it a title). | |
| label(wave5addhealth$VARNAME) <- "Label that tells the audience what this variable says about people" | |
| ########FREQUENCY DISTRIBUTIONS####### | |
| ##Replace VARNAME with your variable name; update the titles and x-axis labels. | |
| freq(wave5addhealth$VARNAME) | |
| #Interpret the output of the line above. | |
| ##If you have a quantitative variable and need the cumulative percent, you can use the call below. | |
| #NOTE(review): frequency() is presumably the lehmansociology version (masking stats::frequency) - confirm. | |
| frequency( | |
| wave5addhealth$VARNAME, | |
| cumulative.percent = TRUE, | |
| title = "YOUR VARIABLE DESCRIPTION" | |
| ) | |
| #Interpret the output of the call above. | |
| ########UNIVARIATE GRAPHS####### | |
| #Be sure to use the appropriate graph for your variable type; change VARNAME, colors, titles, and x-axis labels. | |
| #Bar graph: rows where VARNAME is NA are dropped first, then each bar's height is its count divided by the | |
| #total count, displayed as a percent. after_stat(count) replaces the deprecated ..count.. notation. | |
| ggplot(data = subset(wave5addhealth, !is.na(VARNAME)), aes(x = VARNAME)) + | |
| geom_bar(color = "blue", fill = "yellow", aes(y = after_stat(count / sum(count)))) + | |
| scale_y_continuous(labels = scales::percent) + | |
| ggtitle("Bar Graph of YOUR VARIABLE DESCRIPTION, Wave 5 Add Health") + | |
| labs(y = "Percent", x = "LABEL FOR THE CATEGORIES") + | |
| theme(axis.text.x = element_text(angle = -25)) | |
| #INTERPRET THE RESULT OF THE ABOVE | |
| #Histogram: change VARNAME, colors, title, and x-axis label. Rows where VARNAME is NA are dropped up front | |
| #(as in the bar graph), and each bin's height is its count divided by the total count, times 100. | |
| #after_stat(count) replaces the deprecated ..count.. notation. | |
| ggplot(data = subset(wave5addhealth, !is.na(VARNAME)), aes(x = VARNAME)) + | |
| geom_histogram(color = "blue", fill = "green", binwidth = 1, aes(y = after_stat(count / sum(count) * 100))) + | |
| ggtitle("Distribution of YOUR VARIABLE DESCRIPTION, Add Health Wave 5") + | |
| labs(y = "Percent", x = "UNIT OF MEASUREMENT") | |
| #INTERPRET THE RESULT OF THE ABOVE | |
| ########DESCRIPTIVE STATISTICS####### | |
| MODE(wave5addhealth$VARNAME) | |
| #Interpret the output of the line above. | |
| median(as.numeric(wave5addhealth$VARNAME), na.rm = TRUE) | |
| #Interpret the output of the line above (missing values are removed before the median is taken). | |
| summary(wave5addhealth$VARNAME, na.rm = TRUE) | |
| #Interpret the output of the line above. | |
| sd(wave5addhealth$VARNAME, na.rm = TRUE) | |
| #Interpret the output of the line above (missing values are removed before the standard deviation is taken). | |
| ########BIVARIATE TABLE/GRAPH####### | |
| ##BAR GRAPH FOR QUANTITATIVE DEPENDENT VARIABLE AND CATEGORICAL INDEPENDENT VARIABLE (from chapter 7) | |
| #Change both variable names (y is dependent and x is independent), both axis labels, and the title. | |
| #Rows where H5HR2 is NA are dropped; each bar is the mean of H5ID23 within one H5HR2 category. | |
| #fun = mean replaces the deprecated fun.y argument of stat_summary(). | |
| ggplot(data = subset(wave5addhealth, !is.na(H5HR2))) + | |
| stat_summary(aes(x = H5HR2, y = H5ID23), fun = mean, geom = "bar") + | |
| ylab("Average Hours Per Week") + | |
| xlab("Current Living Arrangements") + | |
| ggtitle("Bar Graph of Average Time Spent Watching TV/Movies/Videos by Living Arrangements") | |
| #Interpretation: The graph shows that Add Health respondents who live in their own | |
| #home watch about 13 hours of TV, movies, and videos per week. The highest average | |
| #time spent watching TV is among those living in their parents' home or another | |
| #person's home; these groups average about 17.5 hours per week. | |
| ##BIVARIATE TABLE (CROSSTAB) FOR 2 CATEGORICAL VARIABLES (not in chapter 7) | |
| #The formula must list the DEPENDENT variable first: dependent ~ independent. | |
| #Getting this order right is critical for interpreting the table correctly. | |
| #We "percent down, compare across" to see how the dependent variable differs | |
| #between the groups of the independent variable. | |
| lehmansociology::crosstab( | |
| H5HR2 ~ H5OD2A, | |
| data = wave5addhealth, | |
| title = "Living Arrangements by Sex Assigned at Birth", | |
| format = "column_percent" | |
| ) | |
| #Interpretation: 85% of males live in their own place, compared to 89% of females. | |
| #A higher percent of those who were assigned the male sex at birth (9.1%) live with | |
| #their parents as adults who are in their 30s or early 40s, compared to 6.4% of females. | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment