Last active
December 23, 2017 15:34
-
-
Save mhoehle/8b89a103c8681bcfc189d8c3eb3babda to your computer and use it in GitHub Desktop.
Small R script to extend analysis of https://www.reddit.com/r/dataisbeautiful/comments/7l9ef7/i_simulated_and_animated_500_instances_of_the/ using unequal occurrence probabilities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ##################################################################### | |
| ## Small R script to extend analysis of | |
| ## https://www.reddit.com/r/dataisbeautiful/comments/7l9ef7/i_simulated_and_animated_500_instances_of_the/ | |
| ## using unequal occurrence probabilities. | |
| ## | |
| ## Author: Michael Höhle <http://www.math.su.se/~hoehle> | |
| ## Date: 2017-12-23 | |
| ## License: https://www.gnu.org/licenses/gpl-3.0.html | |
| ###################################################################### | |
| library(dplyr) | |
| library(magrittr) | |
| library(lubridate) | |
| library(ggplot2) | |
| ##Download data from https://github.com/fivethirtyeight/data/tree/master/births | |
| file <- tempfile() | |
| download.file("https://raw.githubusercontent.com/fivethirtyeight/data/master/births/US_births_1994-2003_CDC_NCHS.csv", destfile=file) | |
| ##Load birthday file | |
| bday <- readr::read_csv(file) | |
| ##Add columns containing date without year and the date | |
| bday %<>% mutate(date = as.Date(sprintf("%d-%02d-%02d",year,month,date_of_month)), | |
| date_in_year = sprintf("%02d-%02d",month,date_of_month)) | |
| ##Determine aggregated counts and proportions | |
| births <- bday %>% group_by(date_in_year) %>% summarise(births = sum(births), proportion= sum(births) / sum(bday %$% births)) | |
| ##Show distribution of the dates | |
| births %>% ggplot(aes(x=date_in_year, y=proportion)) + geom_point() | |
| ##Install package for computing birthday problem with unequal class probabilities | |
| devtools::install_github("hoehleatsu/birthdayproblem") | |
| library(birthdayproblem) | |
| ##Compute collision probabilities either by assuming all 366 days are | |
| ##equally probable or by taking the empirical probabilities for each | |
| ##day. We could use the exact computation using method "Rcpp" or the approximation | |
| ##by Mase (1992) | |
| df <- data.frame(n=2:50) %>% rowwise %>% | |
| mutate(`equal occurr. prob` = pbirthday(n=n,classes=nrow(births),coincident=2), | |
| `unequal occurr. prob`= pbirthday_up(n=as.integer(n), prob=births %$% proportion, method="mase1992")$prob) | |
| ##Format to long for visualization with ggplot | |
| df_long <- df %>% tidyr::gather(key="Assumption", value="probability", -n) | |
| ##Visualize on the probability scale | |
| ggplot(df_long, aes(x=n, y=probability, color=Assumption)) + geom_line() + xlab("Number of People in Room") + ylab("Probability of at least one birthday collision") + scale_y_continuous(labels=scales::percent) + ggtitle("Birthday paradox", subtitle="366 days with equal probability vs. 366 days with probabilities by US National Center for Health Statistics data (1994-2003)") + theme(legend.position="bottom") + scale_color_discrete(name="Assumption:") | |
| ggsave(filename="birthday.png", width=9, height=4, dpi=300) | |
| ##Visualize on the logit scale | |
| ggplot(df_long, aes(x=n, y=qlogis(probability), color=Assumption)) + geom_line() + xlab("Number of People in Room") + ylab("logit(Probability of at least one birthday collision)") + ggtitle("Birthday paradox", subtitle="366 days with equal probability vs. 366 days with probabilities by US National Center for Health Statistics data (1994-2003)") + theme(legend.position="bottom") + scale_color_discrete(name="Assumption:") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment