Skip to content

Instantly share code, notes, and snippets.

@lucacarbonelc
Last active June 15, 2020 15:08
Show Gist options
  • Select an option

  • Save lucacarbonelc/d9cfd6080dac4d822c06b8233578b95c to your computer and use it in GitHub Desktop.

Select an option

Save lucacarbonelc/d9cfd6080dac4d822c06b8233578b95c to your computer and use it in GitHub Desktop.
link chords to lyrics
library(chorrrds)
library(tidyverse)
# Functions ----------------------------------------------------------------
#' Flag the non-whitespace characters of a string
#'
#' Splits `x` into single characters and reports, for each one, whether it is
#' a whitespace character (0) or not (1).
#'
#' @param x A character string; only its first element is inspected.
#' @return A named integer vector with one element per character of `x`:
#'   `1L` for a non-whitespace character and `0L` for a whitespace character.
#'   The names are the characters themselves, with whitespace shown as `""`.
#'   The result is always an integer vector, also for an empty string.
strip_it <- function(x) {
  ch_vector <- strsplit(x, "")[[1]]
  # Blank out whitespace so that nchar() gives 0 for it and 1 for the rest.
  ch_vector[grepl("\\s", ch_vector)] <- ""
  # vapply() keeps the result an integer vector for zero-length input, where
  # sapply() would silently hand back an empty list.
  vapply(ch_vector, nchar, integer(1), USE.NAMES = TRUE)
}
# Function to find the lyric fragment that belongs to one chord
#
# Arguments:
#   chord - chord line (single string) with the chords separated by spaces
#   lyric - lyric line (single string) that goes with `chord`
#   k     - index of the chord within `chord` whose lyric fragment is wanted
#
# Value: a single string made of the characters of `lyric` from the selected
# word start up to the index derived from the next chord (or up to the last
# character when chord k is the last chord of the line).
#
# NOTE(review): `ifelse()` is called with a vector test and `:` may receive
# vector operands below; createNet() wraps its call to this function in
# suppressWarnings(). Behaviour when a chord name occurs more than once in
# `chord` depends on that recycling - confirm before refactoring.
findMin <- function(chord, lyric, k){
# find closest value to chord
# Drop leading/trailing whitespace and collapse whitespace runs, then split
# the chord line into its individual chord tokens.
chs_note <- str_split(gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", chord, perl=TRUE), " ")[[1]]
# Index of the k-th chord among the elements of the raw single-space split;
# runs of spaces produce empty elements there, so the index grows with the
# spacing. Has more than one element if the same token occurs repeatedly.
pos_chr <- which(strsplit(chord, " ")[[1]] == chs_note[k])
# Difference of the 0/1 non-whitespace flags: +1 marks a change from
# whitespace to a character, -1 the opposite change.
pos_txt <- append(0, diff(strip_it(lyric)))
# Keep only the +1 markers.
pos_txt[pos_txt == -1] <- 0
# Character positions in `lyric` where a word starts right after whitespace.
pos_min <- which(pos_txt == 1)
# Both branches compute the same value (abs() is symmetric): the index within
# pos_min of the word start closest to pos_chr. The result has the length of
# the test vector.
minim <- ifelse(pos_chr > pos_min,
which.min(abs(pos_chr - pos_min)),
which.min(abs(pos_min - pos_chr)))
# find whole word that starts at minim
# pos_txt2 is not referenced again in this function.
pos_txt2 <- strip_it(lyric)
# One row per character of `lyric`: V1 holds the character ("" for
# whitespace), V2 its nchar().
ch_vector <- strsplit(lyric, "")[[1]]
ch_vector[c(grep("\\s", ch_vector)) ] <- ""
indx <- as.data.frame(cbind(V1 = ch_vector,
V2 =sapply(ch_vector, nchar)))
# Reset the row names to a plain 1..n sequence.
rownames(indx) <- NULL
rownames(indx) <- as.numeric(rownames(indx))
# Take the characters from the selected word start up to either the index of
# the next chord in the raw split of `chord` (when chord k is not the last
# one) or the last character, and join them with "-". The inner ifelse() has a
# scalar test, so it yields a single value.
complete <- paste(as.character(indx[pos_min[minim]:
ifelse(length(chs_note) > k,
which(strsplit(chord, " ")[[1]] == chs_note[k+1]),
nrow(indx))
, 1]), collapse = "-")
# A whitespace character was stored as "", so it shows up as "--": turn it
# back into a space.
complete1 <- gsub("\\-\\-", " ", complete)
# Remove the remaining "-" separators; this also removes any hyphen that was
# part of the lyric itself.
return(
gsub("\\-", "", complete1, perl=TRUE)
)
}
#' Build the chord/lyric table for a single song page
#'
#' Extracts the text of every `<pre>` node of `x`, splits it into lines,
#' sorts each line into a chord column or a lyric column, and pairs every
#' chord line with the line that follows it.
#'
#' A line counts as a chord line when at least one of its space-separated
#' tokens is contained in the file-level vector `all_notes`.
#'
#' @param x A parsed HTML document, as accepted by `rvest::html_nodes()`.
#' @return A data frame with the columns `V1` (chord line, truncated to the
#'   length of its lyric line) and `V2` (lyric line, left-trimmed and
#'   re-prefixed with blanks), one row per chord/lyric pair, row names reset.
cleanUp <- function(x) {
  chords_lyrics <- rvest::html_nodes(x, "pre") %>%
    rvest::html_text()

  # Remove digits, "-<digit>-" groups, "|" and "~" from the raw text.
  chords_lyrics <- chords_lyrics %>%
    str_remove_all(pattern = "[0-9]|[-][0-9][-]|\\||[0-9][br]|~")

  chords <- tibble(V1 = sapply(chords_lyrics, function(x) strsplit(x, "\n")[[1]], USE.NAMES = FALSE))

  # Two-column frame (V1 = chords, V2 = lyrics); rows beyond the initial
  # allocation are appended by the assignments in the loop.
  chords.dat <- as.data.frame(cbind(rep(NA, nrow(chords) / 2),
                                    rep(NA, nrow(chords) / 2)))

  # seq_len() so that a page without any line does not run the loop at all.
  for (i in seq_len(nrow(chords))) {
    line_i <- as.character(chords[i, 1])
    if (any(unlist(str_split(line_i, " ")) %in% all_notes)) {
      chords.dat[i, 1] <- line_i
    } else {
      chords.dat[i, 2] <- line_i
    }
  }

  chords.dat <- chords.dat[grep("Intro:", chords.dat$V1, invert = TRUE), ]
  chords.dat <- chords.dat[grep("--", chords.dat$V2, invert = TRUE), ]

  # Put the first chord at the beginning of the line, keeping the same number
  # of spaces in front of the following chords. "\\S+" moves the whole token;
  # "\\w+" stopped at "#" or "/" and tore chords such as "C#m" apart.
  chords.dat[, 1] <- sub("( *)(\\S+)", "\\2\\1", chords.dat[, 1])

  # Pair each chord line with the line right below it.
  chords.dat <- chords.dat %>%
    mutate(V2 = lead(V2, n = 1))
  chords.dat <- chords.dat[complete.cases(chords.dat), ]

  # Take only chords that are linked to some lyrics: cut every chord line
  # down to the length of its lyric line.
  for (i in seq_len(nrow(chords.dat))) {
    if (nchar(chords.dat[i, 1]) > nchar(chords.dat[i, 2])) {
      nch <- nchar(chords.dat[i, 1]) - nchar(chords.dat[i, 2])
      chords.dat[i, 1] <- str_sub(chords.dat[i, 1], 1, str_length(chords.dat[i, 1]) - nch)
    }
  }

  # Drop rows in which both the chord and the lyric are empty strings.
  chords.dat <- chords.dat[!apply(chords.dat == "", 1, all), ]

  chords.dat$V2 <- trimws(chords.dat$V2, "l")
  chords.dat$V2 <- paste(" ", chords.dat$V2)

  # Remove occasional blank rows.
  # NOTE(review): after paste(" ", V2) every value holds at least two
  # characters, so comparing against a single blank looks like it can never
  # match - confirm the intended literal.
  blank_rows <- which(chords.dat[, 2] == " ")
  if (length(blank_rows) > 0) {
    chords.dat <- chords.dat[-blank_rows, ]
  }

  rownames(chords.dat) <- NULL
  rownames(chords.dat) <- as.numeric(rownames(chords.dat))
  chords.dat
}
#' Drop a trailing word fragment that belongs to the next lyric chunk
#'
#' If the last space-separated word of `first_sentence` is a leading prefix
#' of (or equal to) the first word of `second_sentence`, that last word is
#' removed.
#'
#' @param first_sentence A single string: the lyric chunk to clean.
#' @param second_sentence A single string with the lyric chunk that follows,
#'   or `NA` when there is none.
#' @return `first_sentence` unchanged when `second_sentence` is `NA`;
#'   otherwise its words re-joined with single spaces, without the trailing
#'   fragment if one was found.
eliminate_words <- function(first_sentence, second_sentence) {
  if (is.na(second_sentence)) {
    return(first_sentence)
  }

  first_word <- word(second_sentence, 1)

  # All leading prefixes of the next chunk's first word ("y", "yo", "you").
  # seq_len() gives no prefixes for an empty word; looping over
  # 1:length(...) produced the bogus candidate "NA" in that case, which could
  # strip a genuine trailing "NA" word.
  n_chars <- nchar(first_word)
  if (is.na(n_chars)) {
    n_chars <- 0L
  }
  words_list <- vapply(
    seq_len(n_chars),
    function(n) substr(first_word, 1L, n),
    character(1)
  )

  # Remove the last word when it is one of those prefixes.
  pieces <- str_split(first_sentence, " ")[[1]]
  last <- length(pieces)
  if (pieces[last] %in% words_list) {
    pieces <- pieces[-last]
  }
  paste0(pieces, collapse = " ")
}
# Function for matching lyrics with chords
#
# Argument:
#   chords.dat - data frame as returned by cleanUp(): column 1 / V1 holds the
#                chord lines, column 2 / V2 the matching lyric lines
#
# Value: a table with one row per chord and the columns `chord` and `lyric`.
# The pipeline at the end calls rowwise() and never ungroup(), so the
# returned object is still row-wise grouped.
#
# Uses the file-level vector `all_notes` as patterns to locate chords.
#
# NOTE(review): rows of chords.net are addressed by pasting the digits of the
# line index and the chord index together (paste0(i, j)) and looking that
# number up in the sorted list of all such numbers. Different (i, j) pairs can
# give the same number (1 and 11 vs. 11 and 1), and match() returns the first
# hit only - confirm this cannot happen for realistic input.
createNet <- function(chords.dat){
# Preallocate with NA: (total number of chord tokens) * (largest number of
# chord tokens in one line) rows, counting only the part of each chord line
# that is not longer than its lyric line.
chords.net <- as.data.frame(
bind_cols(
chord = rep(NA, sum(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))*max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))),
lyric = rep(NA, sum(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))*max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+")))
))
# chords
# For every line i and every chord j in it, fill the `chord` column.
# NOTE(review): the inner loop re-uses `j` and repeats the whole range once
# per iteration of the outer `j` loop; the repeated assignments write the
# same values again.
for( i in 1:nrow(chords.dat) ) {
for ( j in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ) {
# More than one chord in the line: store the two characters that start at the
# j-th smallest distinct position at which any `all_notes` pattern matches.
if( str_count(chords.dat[i, 1], "\\S+") > 1 ){
for ( j in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ) {
chords.net[ match(as.numeric(do.call(paste0, expand.grid(i,j))),
sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 1 ] <- substring( chords.dat[i, 1], # find chords in verse i
sort(unique(na.omit(str_locate(chords.dat[i, 1], all_notes)[,1])))[j], # from this position
sort(unique(na.omit(str_locate(chords.dat[i, 1], all_notes)[,1])))[j]+1 ) # until this position
}
# Otherwise the whole chord line is stored as the chord.
} else { chords.net[match(as.numeric(do.call(paste0, expand.grid(i,j))),
sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 1 ] <- chords.dat[i, 1] }
}
}
# lyrics
# Same row addressing as above; the `lyric` column receives the fragment that
# findMin() returns for chord k of line i, with its warnings suppressed.
for( i in 1:nrow(chords.dat) ){
for ( k in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ){
chords.net[match(as.numeric(do.call(paste0, expand.grid(i,k))),
sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 2 ] <- suppressWarnings(findMin(chords.dat$V1[i], chords.dat$V2[i], k))
}
}
# Drop the preallocated rows that never received a chord and renumber.
chords.net <- chords.net[complete.cases(chords.net$chord), ]
rownames(chords.net) <- NULL
rownames(chords.net) <- as.numeric(rownames(chords.net))
# Row by row, compare each lyric with the next one and let eliminate_words()
# remove a trailing word fragment that re-appears at the start of the next
# lyric; the helper column is dropped afterwards.
chords.net <- chords.net %>%
mutate(
second_sentence = lead(lyric, n = 1)) %>%
rowwise() %>%
mutate(
lyric =
eliminate_words(first_sentence = lyric,
second_sentence = second_sentence)) %>%
select(-second_sentence)
return(chords.net)
}
# Chords ------------------------------------------------------------------
# Building blocks of the chord names that are recognised.
notes <- c("A", "B", "C", "D", "E", "F", "G")
flats <- "b"
minor <- "m"
sharps <- "#"
# Every root note combined with each modifier suffix, grouped suffix by
# suffix: plain, flat, sharp, minor, then the two- and three-part
# combinations in the order listed (e.g. "Ab#", "Am#", "Abm", "Ab#m").
all_notes <- unlist(lapply(
  c(
    "",
    flats,
    sharps,
    minor,
    paste0(flats, sharps),
    paste0(minor, sharps),
    paste0(flats, minor),
    paste0(flats, sharps, minor)
  ),
  function(suffix) paste0(notes, suffix)
))
# Example -----------------------------------------------------------------
# Look up "The Weeknd" with get_songs() and keep the second entry of the
# `url` column of the result.
url <- get_songs("The Weeknd")
url_songs <- url$url[2]

# Fetch and parse the page found under that path on cifraclub.com.br.
x <- paste0("https://www.cifraclub.com.br", url_songs) %>%
  xml2::read_html()

# chords.dat: one row per chord line / lyric line pair
chords.dat <- x %>%
  cleanUp()

# chords.net: one row per chord with its lyric fragment
chords.net <- chords.dat %>%
  createNet()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment