Last active
June 15, 2020 15:08
-
-
Save lucacarbonelc/d9cfd6080dac4d822c06b8233578b95c to your computer and use it in GitHub Desktop.
link chords to lyrics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(chorrrds) | |
| library(tidyverse) | |
| # Functions ---------------------------------------------------------------- | |
# Per-character width mask of a string: 1 for every non-whitespace character,
# 0 for every whitespace character.
#
# Args:
#   x: a single string (only the first element is used).
# Returns:
#   A named integer vector with one entry per character of `x`; the names are
#   the characters themselves, with whitespace replaced by "".
strip_it <- function(x) {
  chars <- strsplit(x, "")[[1]]
  chars[grepl("\\s", chars)] <- ""
  # nchar() is vectorised, so the result is always an integer vector; the
  # sapply() call it replaces returned an empty list for an empty string.
  stats::setNames(nchar(chars), chars)
}
# Find the lyric fragment that belongs to the k-th chord of a chord line.
#
# The chord line is split on single spaces, and the index of a chord among
# the resulting tokens is used as its (approximate) column. The fragment
# starts at the word start closest to that index and runs up to the token
# index of the next chord, or to the end of the lyric for the last chord.
#
# Args:
#   chord: chord line (single string) with chords separated by spaces.
#   lyric: lyric line (single string). A word start is a non-whitespace
#     character preceded by whitespace, so a word beginning at the very
#     first character is never picked; the lyric needs leading whitespace.
#   k: index of the chord of interest, counted from the left.
# Returns:
#   A single string holding the selected lyric characters. Runs of
#   whitespace are squeezed and literal hyphens are dropped as a side effect
#   of the "-" based re-assembly.
# Raises:
#   An error when the chord line has fewer than `k` chords or the lyric has
#   no word start.
findMin <- function(chord, lyric, k) {
  # Chords are addressed by occurrence order rather than by value, so a
  # chord that appears several times in the line keeps its own position.
  chord_tokens <- strsplit(chord, " ")[[1]]
  chord_pos <- which(nzchar(chord_tokens))
  this_chord <- chord_pos[k]
  if (length(this_chord) != 1L || is.na(this_chord)) {
    stop("chord line has no chord number ", k, call. = FALSE)
  }

  # Blank out whitespace so every lyric character has width 1 or 0.
  lyric_chars <- strsplit(lyric, "")[[1]]
  lyric_chars[grepl("\\s", lyric_chars)] <- ""

  # A width step from 0 to 1 marks the first character of a word.
  word_starts <- which(diff(nchar(lyric_chars)) == 1L) + 1L
  if (length(word_starts) == 0L) {
    stop("lyric has no word preceded by whitespace", call. = FALSE)
  }

  from <- word_starts[which.min(abs(this_chord - word_starts))]
  to <- if (k < length(chord_pos)) chord_pos[k + 1L] else length(lyric_chars)

  # Join with "-" so that blanked whitespace shows up as "--", turn those
  # pairs into spaces, then drop the remaining separators.
  fragment <- paste(lyric_chars[from:to], collapse = "-")
  fragment <- gsub("\\-\\-", " ", fragment)
  gsub("\\-", "", fragment, perl = TRUE)
}
# Turn one song page into a table of chord lines paired with lyric lines.
#
# Args:
#   x: parsed HTML document of a song page; the text of its <pre> nodes is
#     used.
# Returns:
#   A data frame with character columns `V1` (chord line) and `V2` (the line
#   that followed it), one row per kept pair. A frame with zero rows is
#   returned when no pair survives the filtering.
#
# Uses the global `all_notes` to tell chord lines from other lines.
cleanUp <- function(x) {
  chords_lyrics <- rvest::html_nodes(x, "pre") %>%
    rvest::html_text() %>%
    str_remove_all(pattern = "[0-9]|[-][0-9][-]|\\||[0-9][br]|~")

  chords <- tibble(
    V1 = sapply(
      chords_lyrics,
      function(x) strsplit(x, "\n")[[1]],
      USE.NAMES = FALSE
    )
  )

  # One row per text line: chord lines go to V1, every other line to V2.
  n_lines <- nrow(chords)
  chords.dat <- data.frame(
    V1 = rep(NA_character_, n_lines),
    V2 = rep(NA_character_, n_lines),
    stringsAsFactors = FALSE
  )
  for (i in seq_len(n_lines)) {
    text_line <- as.character(chords[i, 1])
    if (any(unlist(str_split(text_line, " ")) %in% all_notes)) {
      chords.dat[i, 1] <- text_line
    } else {
      chords.dat[i, 2] <- text_line
    }
  }

  chords.dat <- chords.dat[grep("Intro:", chords.dat$V1, invert = TRUE), ]
  chords.dat <- chords.dat[grep("--", chords.dat$V2, invert = TRUE), ]

  # Put the first chord at the beginning of the line, keeping the same
  # spacing between the remaining chords.
  chords.dat[, 1] <- sub("( *)(\\w+)", "\\2\\1", chords.dat[, 1])

  # Shift V2 up by one row so each chord line sits next to the line that
  # follows it, then keep only rows where both are present.
  chords.dat <- chords.dat %>%
    mutate(V2 = lead(V2, n = 1))
  chords.dat <- chords.dat[complete.cases(chords.dat), ]
  if (nrow(chords.dat) == 0) {
    return(chords.dat)
  }

  # Keep only the chords that sit above some lyric text: cut chord lines
  # that are longer than their lyric line.
  excess <- nchar(chords.dat[, 1]) - nchar(chords.dat[, 2])
  too_long <- excess > 0
  chords.dat[too_long, 1] <- str_sub(
    chords.dat[too_long, 1],
    1,
    str_length(chords.dat[too_long, 1]) - excess[too_long]
  )

  both_empty <- chords.dat$V1 == "" & chords.dat$V2 == ""
  chords.dat <- chords.dat[!both_empty, ]
  if (nrow(chords.dat) == 0) {
    return(chords.dat)
  }

  chords.dat$V2 <- trimws(chords.dat$V2, "l")
  chords.dat$V2 <- paste(" ", chords.dat$V2)
  # Remove occasional blank rows.
  chords.dat <- chords.dat[chords.dat[, 2] != " ", ]

  rownames(chords.dat) <- NULL
  rownames(chords.dat) <- as.numeric(rownames(chords.dat))
  chords.dat
}
# Drop a dangling word fragment from the end of a lyric chunk.
#
# When the last space-separated token of `first_sentence` is a non-empty
# prefix of (or equal to) the first space-separated token of
# `second_sentence`, that last token is removed.
#
# Args:
#   first_sentence: lyric chunk to clean (single string).
#   second_sentence: lyric chunk that follows it (single string or NA).
# Returns:
#   `first_sentence`, without its last token when that token is a prefix of
#   the next chunk's first word. Returned unchanged when either input is NA.
eliminate_words <- function(first_sentence, second_sentence) {
  if (is.na(second_sentence) || is.na(first_sentence)) {
    return(first_sentence)
  }

  # strsplit() drops a trailing empty token; appending one separator keeps
  # it, so "a b " splits into "a", "b", "" and "" splits into "".
  split_on_space <- function(s) {
    strsplit(paste0(s, " "), " ", fixed = TRUE)[[1]]
  }

  first_word <- split_on_space(second_sentence)[1]
  tokens <- split_on_space(first_sentence)
  last_token <- tokens[length(tokens)]

  # An empty first word (next chunk starts with a space or is empty) has no
  # prefixes, so nothing is removed in that case.
  if (nzchar(last_token) && startsWith(first_word, last_token)) {
    tokens <- tokens[-length(tokens)]
  }
  paste0(tokens, collapse = " ")
}
| # Function for matching lyrics with chords | |
# Build a two-column table (`chord`, `lyric`) from the chord/lyric pairs in
# `chords.dat`.
#
# Args:
#   chords.dat: data frame whose first column (also read as `$V1`) holds chord
#     lines and whose second column (also read as `$V2`) holds lyric lines.
# Returns:
#   A table with columns `chord` and `lyric`, restricted to rows whose `chord`
#   was filled in, with `lyric` passed through eliminate_words().
#   NOTE(review): the pipeline ends after rowwise() without an ungroup(), so
#   the result presumably is still a rowwise data frame -- confirm.
#
# Reads the global `all_notes` and calls findMin() and eliminate_words().
| createNet <- function(chords.dat){ | |
# Preallocate sum * max rows, where the counts are the number of
# non-whitespace runs in each chord line truncated to the length of its
# lyric line.
| chords.net <- as.data.frame( | |
| bind_cols( | |
| chord = rep(NA, sum(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))*max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))), | |
| lyric = rep(NA, sum(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))*max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))) | |
| )) | |
| # chords | |
# The target row for (i, j) is the position of the number formed by pasting
# i and j together within the sorted list of all such numbers.
# NOTE(review): pasted codes are not unique once an index has two digits
# (i = 1, j = 11 and i = 11, j = 1 both give 111) -- confirm inputs stay small.
# NOTE(review): `1:n` counts down (1, 0) when n is 0 -- confirm every chord
# line has at least one chord.
| for( i in 1:nrow(chords.dat) ) { | |
| for ( j in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ) { | |
# More than one chord on the line: take two characters starting at the j-th
# smallest position at which any `all_notes` pattern is first found.
# The inner loop reuses `j` over the same range as the enclosing loop.
| if( str_count(chords.dat[i, 1], "\\S+") > 1 ){ | |
| for ( j in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ) { | |
| chords.net[ match(as.numeric(do.call(paste0, expand.grid(i,j))), | |
| sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 1 ] <- substring( chords.dat[i, 1], # find chords in verse i | |
| sort(unique(na.omit(str_locate(chords.dat[i, 1], all_notes)[,1])))[j], # from this position | |
| sort(unique(na.omit(str_locate(chords.dat[i, 1], all_notes)[,1])))[j]+1 ) # until this position | |
| } | |
# Otherwise the whole chord line is stored as the chord.
| } else { chords.net[match(as.numeric(do.call(paste0, expand.grid(i,j))), | |
| sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 1 ] <- chords.dat[i, 1] } | |
| } | |
| } | |
| # lyrics | |
# Same row addressing as above; the lyric for chord k of line i comes from
# findMin(), with its warnings suppressed.
| for( i in 1:nrow(chords.dat) ){ | |
| for ( k in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ){ | |
| chords.net[match(as.numeric(do.call(paste0, expand.grid(i,k))), | |
| sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 2 ] <- suppressWarnings(findMin(chords.dat$V1[i], chords.dat$V2[i], k)) | |
| } | |
| } | |
# Keep only the rows whose chord was filled in, then renumber the rows.
| chords.net <- chords.net[complete.cases(chords.net$chord), ] | |
| rownames(chords.net) <- NULL | |
| rownames(chords.net) <- as.numeric(rownames(chords.net)) | |
# Compare every lyric with the one in the next row and let eliminate_words()
# clean it; the helper column is dropped afterwards.
| chords.net <- chords.net %>% | |
| mutate( | |
| second_sentence = lead(lyric, n = 1)) %>% | |
| rowwise() %>% | |
| mutate( | |
| lyric = | |
| eliminate_words(first_sentence = lyric, | |
| second_sentence = second_sentence)) %>% | |
| select(-second_sentence) | |
| return(chords.net) | |
| } | |
| # Chords ------------------------------------------------------------------ | |
# Chord vocabulary: the seven root notes and the suffixes combined with them.
notes <- c("A", "B", "C", "D", "E", "F", "G")
flats <- "b"
minor <- "m"
sharps <- "#"
# Every root note with each suffix, grouped by suffix in this order: none,
# flat, sharp, minor, flat+sharp, minor+sharp, flat+minor, flat+sharp+minor.
all_notes <- c(
  notes,
  paste0(notes, flats),
  paste0(notes, sharps),
  paste0(notes, minor),
  paste0(notes, flats, sharps),
  paste0(notes, minor, sharps),
  paste0(notes, flats, minor),
  paste0(notes, flats, sharps, minor)
)
| # Example ----------------------------------------------------------------- | |
# Look up the artist's songs and keep the link of the second one listed.
url <- get_songs("The Weeknd")
url_songs <- url[["url"]][2]

# Download that song's page.
x <- paste0("https://www.cifraclub.com.br", url_songs) %>%
  xml2::read_html()

# Chord lines paired with lyric lines.
chords.dat <- cleanUp(x)

# One row per chord with the lyric fragment linked to it.
chords.net <- createNet(chords.dat)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment