Last active
February 23, 2024 00:19
-
-
Save k5cents/f52a38286dfea494996db8b7f5338a4c to your computer and use it in GitHub Desktop.
Create some plots from Spotify streaming history data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(tidyverse) | |
| library(jsonlite) | |
| library(scales) | |
| library(fs) | |
| library(k5) | |
| # functions --------------------------------------------------------------- | |
| most_common <- function (x, n = 6) { | |
| as.vector(na.omit(names(sort(table(x), decreasing = TRUE)[1:n]))) | |
| } | |
| is_even <- function (x) { | |
| x %% 2 == 0 | |
| } | |
| # read and format --------------------------------------------------------- | |
| # map all json files into one data frame | |
| x <- map_df( | |
| .x = dir_ls("~/Downloads/my_spotify_data/MyData/", regexp = "endsong_\\d"), | |
| .f = read_json, | |
| simplifyDataFrame = TRUE | |
| ) | |
| x <- as_tibble(x) | |
| x$ts <- as_datetime(x$ts, tz = "EST") | |
| # remove rows and columns with podcast data | |
| x <- x %>% | |
| filter(is.na(episode_name)) %>% | |
| select(-contains("episode")) | |
| x <- x %>% | |
| filter(!is.na(master_metadata_track_name)) | |
| # format and select columns | |
| x <- x %>% | |
| mutate(min = ms_played / 6e4) %>% | |
| select( | |
| ts, | |
| min, | |
| platform, | |
| track = master_metadata_track_name, | |
| album = master_metadata_album_album_name, | |
| artist = master_metadata_album_artist_name, | |
| reason_start, | |
| reason_end | |
| ) | |
| # clean up platform | |
| x <- x %>% | |
| mutate( | |
| platform = platform %>% | |
| str_to_lower() %>% | |
| str_remove("partner windows_tv microsoft;") %>% | |
| str_remove("_") %>% | |
| str_remove("[:punct:].*$") %>% | |
| str_replace("os x", "osx") %>% | |
| word(1) %>% | |
| str_replace("xboxone", "xbox") %>% | |
| str_replace("xboxseries", "xbox") %>% | |
| str_replace("partner", "cast") | |
| ) | |
| # artist bar -------------------------------------------------------------- | |
| x %>% | |
| group_by(artist) %>% | |
| summarise(hours = sum(min) / 60) %>% | |
| arrange(desc(hours)) %>% | |
| head(20) %>% | |
| ggplot(aes(x = reorder(artist, hours), y = hours)) + | |
| geom_col(color = "black", aes(fill = hours)) + | |
| scale_fill_viridis_c(option = "B", guide = "none", end = 0.8) + | |
| coord_flip() + | |
| labs(x = "Artist", y = "Hours") + | |
| theme_classic() | |
| x %>% | |
| group_by(artist) %>% | |
| filter() %>% | |
| summarise(min = sum(min), n = n()) %>% | |
| arrange(desc(n)) %>% | |
| head(100) %>% | |
| write_csv("~/Documents/spotify_artists.csv", na = "") | |
| # artist month ------------------------------------------------------------ | |
| x %>% | |
| group_by(month = month(ts)) %>% | |
| # select top N per month | |
| filter( | |
| artist %in% most_common(artist, 3), | |
| year(ts) == max(year(ts)) - 1 | |
| ) %>% | |
| # add Y for labels and bars | |
| group_by(month, artist) %>% | |
| summarise( | |
| total_min = sum(min), | |
| .groups = "drop_last" | |
| ) %>% | |
| mutate( | |
| lbl_y = cumsum(total_min), | |
| lbl_abb = if_else( | |
| total_min < 100, "", abbreviate(artist) | |
| ) | |
| ) %>% | |
| ggplot( | |
| mapping = aes(x = month, y = total_min) | |
| ) + | |
| geom_col( | |
| mapping = aes(fill = str_trunc(artist, width = 25)), | |
| color = "black", | |
| position = position_stack(reverse = TRUE) | |
| ) + | |
| geom_text( | |
| mapping = aes(y = lbl_y, label = lbl_abb), | |
| vjust = 1.5, | |
| color = "black" | |
| ) + | |
| scale_x_continuous( | |
| breaks = 1:12, | |
| labels = month.abb | |
| ) + | |
| scale_y_continuous( | |
| labels = comma | |
| ) + | |
| labs( | |
| title = "Most listened to artists each month", | |
| fill = "Artist", | |
| x = "Month", | |
| y = "Minutes" | |
| ) + | |
| theme_classic() | |
| # album bar --------------------------------------------------------------- | |
| x %>% | |
| group_by(album, artist) %>% | |
| filter(album %out% c("")) %>% | |
| summarise(hours = sum(min) / 60) %>% | |
| arrange(desc(hours)) %>% | |
| head(20) %>% | |
| mutate( | |
| lbl = paste( | |
| str_trunc(album, 30, side = "right"), | |
| str_trunc(artist, 30, side = "right"), | |
| sep = " | " | |
| ) | |
| ) %>% | |
| ggplot(aes(x = reorder(lbl, hours), y = hours)) + | |
| geom_col(color = "black", aes(fill = hours)) + | |
| scale_fill_viridis_c(option = "B", guide = "none", end = 0.8) + | |
| coord_flip() + | |
| labs(x = "Artist", y = "Hours") + | |
| theme_classic() | |
| x %>% | |
| mutate( | |
| album = str_to_title(str_remove(album, "\\s\\(.*\\)$")), | |
| ) %>% | |
| group_by(album, artist) %>% | |
| filter() %>% | |
| summarise(min = sum(min), n = n()) %>% | |
| arrange(desc(n)) %>% | |
| head(100) %>% | |
| write_csv("~/Documents/spotify_albums.csv", na = "") | |
| # song bar ---------------------------------------------------------------- | |
| x %>% | |
| group_by(track, artist) %>% | |
| filter(track %out% c("")) %>% | |
| summarise(hours = sum(min) / 60) %>% | |
| arrange(desc(hours)) %>% | |
| head(20) %>% | |
| mutate( | |
| lbl = paste( | |
| str_trunc(track, 30, side = "right"), | |
| str_trunc(artist, 30, side = "right"), | |
| sep = " | " | |
| ) | |
| ) %>% | |
| ggplot(aes(x = reorder(lbl, hours), y = hours)) + | |
| geom_col(color = "black", aes(fill = hours)) + | |
| scale_fill_viridis_c(option = "B", guide = "none", end = 0.8) + | |
| coord_flip() + | |
| labs(x = "Artist", y = "Hours") + | |
| theme_classic() | |
| x %>% | |
| mutate( | |
| album = str_to_title(str_remove(album, "\\s\\(.*\\)$")), | |
| track = str_to_title(track) | |
| ) %>% | |
| group_by(track, album, artist) %>% | |
| filter() %>% | |
| summarise(min = sum(min), n = n()) %>% | |
| arrange(desc(n)) %>% | |
| head(100) %>% | |
| write_csv("~/Documents/spotify_songs.csv", na = "") | |
| # platform ---------------------------------------------------------------- | |
| x %>% | |
| group_by(platform, yr = year(ts)) %>% | |
| summarise(hours = sum(min) / 60) %>% | |
| ggplot(aes(x = yr, y = hours)) + | |
| geom_col(color = "black", aes(fill = platform)) + | |
| labs(x = "Year", y = "Hours") + | |
| theme_classic() | |
| x %>% | |
| group_by(yr = year(ts), platform) %>% | |
| summarise(hours = sum(min) / 60) %>% | |
| mutate(p = hours / sum(hours)) %>% | |
| ggplot(aes(x = yr, y = p)) + | |
| geom_col(color = "black", aes(fill = platform)) + | |
| labs(x = "Year", y = "Hours") + | |
| theme_classic() | |
| # time polar -------------------------------------------------------------- | |
| x %>% | |
| mutate( | |
| hour = hour(ts), | |
| year = year(ts), | |
| ) %>% | |
| group_by(year, hour) %>% | |
| summarise(totalMin = sum(min)) %>% | |
| filter(year > 2014) %>% | |
| # complete(hour = 0:23) %>% | |
| ggplot(aes(x = hour, y = totalMin)) + | |
| geom_col( | |
| mapping = aes(fill = totalMin), | |
| color = "black" | |
| ) + | |
| scale_fill_viridis_c( | |
| end = 0.90, | |
| option = "B", | |
| guide = "none" | |
| ) + | |
| scale_x_continuous( | |
| breaks = 0:23, | |
| minor_breaks = NULL, | |
| # labels = function(x) { | |
| # format(as.POSIXct(as.character(x), format = "%H"), format = "%I %p") | |
| # } | |
| ) + | |
| scale_y_continuous( | |
| labels = label_comma(), | |
| n.breaks = 10 | |
| ) + | |
| # coord_polar(start = 0) + | |
| labs( | |
| title = "Listening time by hour of the day", | |
| x = "Hour", | |
| y = "Minutes" | |
| ) + | |
| theme_classic() + | |
| theme(axis.text.y = element_blank()) + | |
| facet_wrap(~year, ncol = 1) | |
| # cumulative time --------------------------------------------------------- | |
| y <- x %>% | |
| filter( | |
| artist %in% most_common(artist, 5), | |
| year(ts) == max(year(ts)) - 1 | |
| ) %>% | |
| group_by(artist, wk = week(ts)) %>% | |
| summarise(wk_min = sum(min)) %>% | |
| mutate( | |
| cum_min = cumsum(wk_min), | |
| lbl_abb = abbreviate(artist) | |
| ) | |
| y %>% | |
| ggplot( | |
| mapping = aes(x = wk, y = cum_min) | |
| ) + | |
| geom_step( | |
| mapping = aes(color = artist) | |
| ) + | |
| geom_text( | |
| data = filter(y, cum_min == max(cum_min)), | |
| x = max(y$wk), | |
| nudge_y = max(y$cum_min) * 0.02, | |
| mapping = aes( | |
| label = artist, | |
| color = artist, | |
| x = wk | |
| ) | |
| ) + | |
| scale_y_continuous( | |
| labels = comma | |
| ) + | |
| scale_color_discrete( | |
| guide = "none" | |
| ) + | |
| labs( | |
| title = "Cumulative listening time over year", | |
| color = "Artist", | |
| x = "Week", | |
| y = "Minutes" | |
| ) + | |
| theme_classic() | |
| # calendar ---------------------------------------------------------------- | |
| x %>% | |
| group_by(date = as_date(ts)) %>% | |
| summarise(min_total = sum(min)) %>% | |
| mutate( | |
| year = year(date), | |
| month = month(date, label = TRUE, abbr = TRUE), | |
| wday = fct_relevel( | |
| .f = wday(date, label = TRUE, abbr = TRUE), | |
| c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun") | |
| ), | |
| day = day(date), | |
| wk = format(date, "%W"), | |
| ### cap outlier | |
| min_total = ifelse(min_total > 500, 500, min_total) | |
| ) %>% | |
| filter(year > 2014, year < 2023) %>% | |
| ggplot( | |
| mapping = aes(x = wk, y = wday) | |
| ) + | |
| geom_tile( | |
| mapping = aes(fill = min_total), | |
| color = "black", | |
| linewidth = 0.5 | |
| ) + | |
| coord_equal() + | |
| scale_fill_viridis_c( | |
| end = 0.95, | |
| option = "B", | |
| na.value = "black" | |
| ) + | |
| labs( | |
| title = "Listened time each day of the year", | |
| x = "Week", | |
| y = "Weekday", | |
| fill = "Minutes" | |
| ) + | |
| scale_x_discrete( | |
| labels = if_else(is_even(0:53), "", as.character(0:53)) | |
| ) + | |
| theme_classic() + | |
| theme(panel.background = element_rect(fill = "black")) + | |
| facet_wrap(~year, ncol = 1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment