lucacarbonelc · June 15, 2020 15:08
diff --git a/chords + lyrics b/chords + lyrics
 library(chorrrds)
 library(tidyverse)

 # Functions ----------------------------------------------------------------

 # Function to get rid of spaces
 # Flag which characters of a string are non-whitespace.
 #
 # x: a single character string.
 # Returns an integer vector with one element per character of `x` (the
 # characters themselves are used as names): 1 for a non-whitespace
 # character, 0 for a whitespace character. An empty string gives a
 # zero-length integer vector.
 strip_it <- function(x){
   ch_vector <- strsplit(x, "")[[1]]
   # blank out whitespace so that nchar() reports 0 for those positions
   ch_vector[grep("\\s", ch_vector)] <- ""
   # vapply() keeps the result an integer vector even for empty input,
   # where sapply() would hand back an empty list
   vapply(ch_vector, nchar, integer(1))
 }

 # Function to find the min
 # Extract the lyric fragment that goes with the k-th chord of a line.
 #
 # chord: the chord line (chords separated by spaces).
 # lyric: the lyric line printed under that chord line.
 # k:     index of the chord within the chord line.
 # Returns a single string: the characters of `lyric` from the word start
 # closest to the chord up to the next chord (or the end of the line), with
 # whitespace runs turned into single spaces.
 # Uses strip_it() defined in this file.
 findMin <- function(chord, lyric, k){
  
  # find closest value to chord
  # chs_note: chord tokens after dropping leading/trailing whitespace and
  # collapsing repeated whitespace
  chs_note <- str_split(gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", chord, perl=TRUE), " ")[[1]]
  # index of the k-th chord in the raw space-split line; consecutive spaces
  # produce empty elements, so this index serves as the chord's position
  # NOTE(review): a chord that occurs more than once gives several indices
  pos_chr  <- which(strsplit(chord, " ")[[1]] == chs_note[k])
  # 0/1 steps of the non-whitespace indicator: +1 marks the start of a word
  pos_txt  <- append(0, diff(strip_it(lyric)))
  pos_txt[pos_txt == -1] <- 0
  pos_min  <- which(pos_txt == 1)
  # NOTE(review): both branches compute the same value because abs() is
  # symmetric; the result has the length of the comparison
  minim    <- ifelse(pos_chr > pos_min,
                     which.min(abs(pos_chr - pos_min)),
                     which.min(abs(pos_min - pos_chr)))
  
  # find whole word that starts at minim
  # NOTE(review): pos_txt2 is assigned but not used below
  pos_txt2  <- strip_it(lyric)
  ch_vector <- strsplit(lyric, "")[[1]]
  ch_vector[c(grep("\\s", ch_vector)) ] <- ""
  # one row per character of the lyric (whitespace stored as "")
  indx      <- as.data.frame(cbind(V1 = ch_vector, 
                                   V2 =sapply(ch_vector, nchar)))
  rownames(indx) <- NULL
  rownames(indx) <- as.numeric(rownames(indx))
  
  # characters from the chosen word start up to the position of the next
  # chord (or the last character), joined with "-" so that a blanked
  # whitespace character shows up as "--"
  complete <-  paste(as.character(indx[pos_min[minim]:
                                         ifelse(length(chs_note) > k,
                                                which(strsplit(chord, " ")[[1]] == chs_note[k+1]),
                                                nrow(indx))
                                       , 1]), collapse = "-")
  # "--" becomes a space between words
  complete1 <- gsub("\\-\\-", " ", complete)
  
  return(
    
    # remove the remaining joiners (this also removes hyphens from the lyric)
    gsub("\\-", "", complete1, perl=TRUE)
    
  )
 }

 # Function for polishing data frame for single song
 # Build the chord-line / lyric-line table for a single song page.
 #
 # x: parsed HTML document (e.g. from xml2::read_html()); the text of its
 #    <pre> nodes is split into lines.
 # Returns a data frame with columns V1 (a chord line, cut so it is not
 # longer than the lyric line that follows it) and V2 (that lyric line with
 # leading whitespace replaced by two spaces). A line counts as a chord line
 # when one of its space-separated tokens is in the global `all_notes`.
 # When no chord/lyric pair is found an empty data frame is returned.
 cleanUp <- function(x){
   chords_lyrics <- rvest::html_nodes(x, "pre") %>%
     rvest::html_text()

   # drop digits, "-N-" runs, bars, "Nb"/"Nr" markers and tildes
   # (presumably tablature residue -- confirm against the page markup)
   chords_lyrics <-
     chords_lyrics %>%
     str_remove_all(pattern = "[0-9]|[-][0-9][-]|\\||[0-9][br]|~")

   chords <- tibble(V1 = sapply(chords_lyrics, function(x) strsplit(x, "\n")[[1]], USE.NAMES=FALSE))

   chords.dat <- as.data.frame(cbind(rep(NA, nrow(chords)/2),
                                     rep(NA, nrow(chords)/2)))

   # chord lines go to column 1, every other line to column 2
   for (i in seq_len(nrow(chords))) {
     line_i <- as.character(chords[i, 1])
     if (any(unlist(str_split(line_i, " ")) %in% all_notes)) {
       chords.dat[i, 1] <- line_i
     } else {
       chords.dat[i, 2] <- line_i
     }
   }

   chords.dat <- chords.dat[ grep("Intro:", chords.dat$V1, invert = TRUE), ]
   chords.dat <- chords.dat[ grep("--", chords.dat$V2, invert = TRUE), ]
   # put all the first chords at the beginning of the verse (keeping the
   # same spaces between more than two chords)
   chords.dat[,1] <- sub("( *)(\\w+)", "\\2\\1", chords.dat[,1])
   # pair every chord line with the line that follows it
   chords.dat$V2 <- dplyr::lead(chords.dat$V2, n = 1)
   chords.dat <- chords.dat[complete.cases(chords.dat), ]

   # nothing to polish; the steps below assume at least one row
   if (nrow(chords.dat) == 0L) {
     return(chords.dat)
   }

   # take only chords that are linked to some lyrics
   for (i in seq_len(nrow(chords.dat))) {
     if (nchar(chords.dat[i, 1]) > nchar(chords.dat[i, 2])) {
       nch <- nchar(chords.dat[i, 1]) - nchar(chords.dat[i, 2])
       chords.dat[i, 1] <- str_sub(chords.dat[i, 1], 1, str_length(chords.dat[i, 1])-nch)
     }
   }

   chords.dat <- chords.dat[!apply(chords.dat == "", 1, all), ]
   chords.dat$V2 <- trimws(chords.dat$V2, "l")
   chords.dat$V2 <- paste(" ", chords.dat$V2)
   # remove occasional blank rows (e.g., chords.dat[[9]]); an empty lyric is
   # exactly two spaces after the paste() above
   chords.dat <- chords.dat[chords.dat[, 2] != "  ", ]
   rownames(chords.dat) <- NULL
   rownames(chords.dat) <- as.numeric(rownames(chords.dat))
   chords.dat
 }

 # Function to eliminate remaining words
 # Drop a trailing word fragment that really belongs to the next lyric.
 #
 # first_sentence:  lyric fragment attached to the current chord.
 # second_sentence: lyric fragment attached to the next chord (NA when
 #                  there is none).
 # If the last space-separated token of `first_sentence` is a prefix of
 # (or equal to) the first word of `second_sentence`, that token is
 # removed. Returns the resulting sentence as a single string;
 # `first_sentence` is returned untouched when `second_sentence` is NA.
 eliminate_words <- function(first_sentence, second_sentence){
   if (is.na(second_sentence)) return(first_sentence)

   first_word <- word(second_sentence, 1)

   # Get all possible words that could have remained in the previous
   # sentence: every non-empty prefix of the first word. An NA or empty
   # first word has no prefixes (a 1:length() loop would have produced the
   # bogus candidate "NA" here).
   if (is.na(first_word) || !nzchar(first_word)) {
     words_list <- character(0)
   } else {
     words_list <- substring(first_word, 1, seq_len(nchar(first_word)))
   }

   # Eliminate them
   pieces <- str_split(first_sentence, " ")[[1]]
   last   <- length(pieces)
   if (last > 0 && pieces[last] %in% words_list) {
     pieces <- pieces[-last]
   }

   paste0(pieces, collapse = " ")
 }

 # Function for matching lyrics with chords
 # Pair every chord of a song with the lyric fragment that belongs to it.
 #
 # chords.dat: data frame with chord lines in column 1 (V1) and lyric lines
 #             in column 2 (V2) -- presumably the output of cleanUp().
 # Returns a data frame with one row per chord and the columns `chord` and
 # `lyric`.
 # Reads the global `all_notes` and calls findMin() and eliminate_words()
 # defined in this file.
 createNet <- function(chords.dat){
  
  # preallocate (number of chords) * (max chords per line) rows of NA; the
  # chord count of a line only considers the part not longer than its lyric
  chords.net <- as.data.frame(
    bind_cols(
      chord = rep(NA, sum(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))*max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))),
      lyric = rep(NA, sum(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))*max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+")))
    ))
  
  # chords
  # The target row for line i / chord j is found by pasting i and j into one
  # number and looking it up in the sorted list of all pasted combinations.
  # NOTE(review): pasted keys can collide (e.g. i = 1, j = 11 and i = 11,
  # j = 1 both give 111) -- confirm this cannot happen for real songs.
  # NOTE(review): the inner loop reuses `j`, so for a line with several
  # chords the same assignments are repeated on every outer iteration.
  for( i in 1:nrow(chords.dat) ) {
    for ( j in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ) {
      if( str_count(chords.dat[i, 1], "\\S+") > 1 ){
        
        for ( j in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ) {
          
          chords.net[ match(as.numeric(do.call(paste0, expand.grid(i,j))),
                            sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 1 ] <- substring( chords.dat[i, 1],                                                         # find chords in verse i
                                                                                                                                                                                                                                    sort(unique(na.omit(str_locate(chords.dat[i, 1], all_notes)[,1])))[j],    # from this position
                                                                                                                                                                                                                                    sort(unique(na.omit(str_locate(chords.dat[i, 1], all_notes)[,1])))[j]+1 )  # until this position
          
        }
        
      } else { chords.net[match(as.numeric(do.call(paste0, expand.grid(i,j))),
                                sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 1 ] <- chords.dat[i, 1] }
      
    }
  }
  
  
  # lyrics
  # same row lookup as above; warnings raised inside findMin() are silenced
  for( i in 1:nrow(chords.dat) ){
    for ( k in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ){
      
      chords.net[match(as.numeric(do.call(paste0, expand.grid(i,k))),
                       sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 2 ] <- suppressWarnings(findMin(chords.dat$V1[i], chords.dat$V2[i], k)) 
      
    }
  }
  
  # keep only the rows that received a chord
  chords.net <- chords.net[complete.cases(chords.net$chord), ]
  rownames(chords.net) <- NULL
  rownames(chords.net) <- as.numeric(rownames(chords.net))
  # compare each lyric with the next one and drop a trailing word fragment
  # that the next lyric starts with
  chords.net <- chords.net %>%
    mutate(
      second_sentence = lead(lyric, n = 1)) %>% 
    rowwise() %>% 
    mutate(
      lyric = 
        eliminate_words(first_sentence = lyric, 
                        second_sentence = second_sentence)) %>% 
    select(-second_sentence)
  
  return(chords.net)
 }


 # Chords ------------------------------------------------------------------

 # Note letters and the modifier symbols that may follow them
 notes  <- c("A", "B", "C", "D", "E", "F", "G")
 flats  <- "b"
 minor  <- "m"
 sharps <- "#"

 # Every note letter combined with every modifier combination; outer()
 # varies the note letter fastest, so the values are grouped by modifier
 all_notes <- as.vector(outer(
   notes,
   c("",
     flats,
     sharps,
     minor,
     paste0(flats, sharps),
     paste0(minor, sharps),
     paste0(flats, minor),
     paste0(flats, sharps, minor)),
   paste0
 ))



 # Example -----------------------------------------------------------------

 # Song list of the artist; the path of its second entry is used below
 url <- "The Weeknd" %>% get_songs()
 url_songs <- url$url[2]

 # Download and parse the page of that song
 x <- "https://www.cifraclub.com.br" %>%
   paste0(url_songs) %>%
   xml2::read_html()

 # chords.dat: chord lines paired with their lyric lines
 chords.dat <- x %>% cleanUp()

 # chords.net: one row per chord with its lyric fragment
 chords.net <- chords.dat %>% createNet()
	library(chorrrds)
	library(tidyverse)

	# Functions ----------------------------------------------------------------

	# Function to get rid of spaces
	# Flag which characters of a string are non-whitespace.
	#
	# x: a single character string.
	# Returns an integer vector with one element per character of `x` (the
	# characters themselves are used as names): 1 for a non-whitespace
	# character, 0 for a whitespace character. An empty string gives a
	# zero-length integer vector.
	strip_it <- function(x){
	  ch_vector <- strsplit(x, "")[[1]]
	  # blank out whitespace so that nchar() reports 0 for those positions
	  ch_vector[grep("\\s", ch_vector)] <- ""
	  # vapply() keeps the result an integer vector even for empty input,
	  # where sapply() would hand back an empty list
	  vapply(ch_vector, nchar, integer(1))
	}

	# Function to find the min
	# Extract the lyric fragment that goes with the k-th chord of a line.
	#
	# chord: the chord line (chords separated by spaces).
	# lyric: the lyric line printed under that chord line.
	# k:     index of the chord within the chord line.
	# Returns a single string: the characters of `lyric` from the word start
	# closest to the chord up to the next chord (or the end of the line), with
	# whitespace runs turned into single spaces.
	# Uses strip_it() defined in this file.
	findMin <- function(chord, lyric, k){

	  # find closest value to chord
	  # chord tokens after dropping leading/trailing whitespace and collapsing
	  # repeated whitespace ("|" must not be backslash-escaped inside the
	  # string literal: "\|" is not a valid R escape)
	  chs_note <- str_split(gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", chord, perl=TRUE), " ")[[1]]
	  # index of the k-th chord in the raw space-split line; consecutive spaces
	  # produce empty elements, so this index serves as the chord's position
	  pos_chr <- which(strsplit(chord, " ")[[1]] == chs_note[k])
	  # 0/1 steps of the non-whitespace indicator: +1 marks the start of a word
	  pos_txt <- append(0, diff(strip_it(lyric)))
	  pos_txt[pos_txt == -1] <- 0
	  pos_min <- which(pos_txt == 1)
	  # NOTE(review): both branches compute the same value because abs() is
	  # symmetric; kept as is so the result keeps the length of the comparison
	  minim <- ifelse(pos_chr > pos_min,
	                  which.min(abs(pos_chr - pos_min)),
	                  which.min(abs(pos_min - pos_chr)))

	  # find whole word that starts at minim
	  ch_vector <- strsplit(lyric, "")[[1]]
	  ch_vector[c(grep("\\s", ch_vector)) ] <- ""
	  # one row per character of the lyric (whitespace stored as "")
	  indx <- as.data.frame(cbind(V1 = ch_vector,
	                              V2 =sapply(ch_vector, nchar)))
	  rownames(indx) <- NULL
	  rownames(indx) <- as.numeric(rownames(indx))

	  # characters from the chosen word start up to the position of the next
	  # chord (or the last character), joined with "-" so that a blanked
	  # whitespace character shows up as "--"
	  complete <- paste(as.character(indx[pos_min[minim]:
	                                        ifelse(length(chs_note) > k,
	                                               which(strsplit(chord, " ")[[1]] == chs_note[k+1]),
	                                               nrow(indx))
	                                      , 1]), collapse = "-")
	  # "--" becomes a space between words
	  complete1 <- gsub("\\-\\-", " ", complete)

	  return(

	    # remove the remaining joiners (this also removes hyphens from the lyric)
	    gsub("\\-", "", complete1, perl=TRUE)

	  )
	}

	# Function for polishing data frame for single song
	# Build the chord-line / lyric-line table for a single song page.
	#
	# x: parsed HTML document (e.g. from xml2::read_html()); the text of its
	#    <pre> nodes is split into lines.
	# Returns a data frame with columns V1 (a chord line, cut so it is not
	# longer than the lyric line that follows it) and V2 (that lyric line with
	# leading whitespace replaced by two spaces). A line counts as a chord line
	# when one of its space-separated tokens is in the global `all_notes`.
	# When no chord/lyric pair is found an empty data frame is returned.
	cleanUp <- function(x){
	  chords_lyrics <- rvest::html_nodes(x, "pre") %>%
	    rvest::html_text()

	  # drop digits, "-N-" runs, bars, "Nb"/"Nr" markers and tildes
	  # (presumably tablature residue -- confirm against the page markup);
	  # only the literal bar is escaped, the alternation bars are plain "|"
	  chords_lyrics <-
	    chords_lyrics %>%
	    str_remove_all(pattern = "[0-9]|[-][0-9][-]|\\||[0-9][br]|~")

	  chords <- tibble(V1 = sapply(chords_lyrics, function(x) strsplit(x, "\n")[[1]], USE.NAMES=FALSE))

	  chords.dat <- as.data.frame(cbind(rep(NA, nrow(chords)/2),
	                                    rep(NA, nrow(chords)/2)))

	  # chord lines go to column 1, every other line to column 2
	  for (i in seq_len(nrow(chords))) {
	    line_i <- as.character(chords[i, 1])
	    if (any(unlist(str_split(line_i, " ")) %in% all_notes)) {
	      chords.dat[i, 1] <- line_i
	    } else {
	      chords.dat[i, 2] <- line_i
	    }
	  }

	  chords.dat <- chords.dat[ grep("Intro:", chords.dat$V1, invert = TRUE), ]
	  chords.dat <- chords.dat[ grep("--", chords.dat$V2, invert = TRUE), ]
	  # put all the first chords at the beginning of the verse (keeping the
	  # same spaces between more than two chords)
	  chords.dat[,1] <- sub("( *)(\\w+)", "\\2\\1", chords.dat[,1])
	  # pair every chord line with the line that follows it
	  chords.dat$V2 <- dplyr::lead(chords.dat$V2, n = 1)
	  chords.dat <- chords.dat[complete.cases(chords.dat), ]

	  # nothing to polish; the steps below assume at least one row
	  if (nrow(chords.dat) == 0L) {
	    return(chords.dat)
	  }

	  # take only chords that are linked to some lyrics
	  for (i in seq_len(nrow(chords.dat))) {
	    if (nchar(chords.dat[i, 1]) > nchar(chords.dat[i, 2])) {
	      nch <- nchar(chords.dat[i, 1]) - nchar(chords.dat[i, 2])
	      chords.dat[i, 1] <- str_sub(chords.dat[i, 1], 1, str_length(chords.dat[i, 1])-nch)
	    }
	  }

	  chords.dat <- chords.dat[!apply(chords.dat == "", 1, all), ]
	  chords.dat$V2 <- trimws(chords.dat$V2, "l")
	  chords.dat$V2 <- paste(" ", chords.dat$V2)
	  # remove occasional blank rows (e.g., chords.dat[[9]]); an empty lyric is
	  # exactly two spaces after the paste() above, so compare against "  "
	  chords.dat <- chords.dat[chords.dat[, 2] != "  ", ]
	  rownames(chords.dat) <- NULL
	  rownames(chords.dat) <- as.numeric(rownames(chords.dat))
	  chords.dat
	}

	# Function to eliminate remaining words
	# Drop a trailing word fragment that really belongs to the next lyric.
	#
	# first_sentence:  lyric fragment attached to the current chord.
	# second_sentence: lyric fragment attached to the next chord (NA when
	#                  there is none).
	# If the last space-separated token of `first_sentence` is a prefix of
	# (or equal to) the first word of `second_sentence`, that token is
	# removed. Returns the resulting sentence as a single string;
	# `first_sentence` is returned untouched when `second_sentence` is NA.
	eliminate_words <- function(first_sentence, second_sentence){
	  if (is.na(second_sentence)) return(first_sentence)

	  first_word <- word(second_sentence, 1)

	  # Get all possible words that could have remained in the previous
	  # sentence: every non-empty prefix of the first word. An NA or empty
	  # first word has no prefixes (a 1:length() loop would have produced the
	  # bogus candidate "NA" here).
	  if (is.na(first_word) || !nzchar(first_word)) {
	    words_list <- character(0)
	  } else {
	    words_list <- substring(first_word, 1, seq_len(nchar(first_word)))
	  }

	  # Eliminate them
	  pieces <- str_split(first_sentence, " ")[[1]]
	  last   <- length(pieces)
	  if (last > 0 && pieces[last] %in% words_list) {
	    pieces <- pieces[-last]
	  }

	  paste0(pieces, collapse = " ")
	}

	# Function for matching lyrics with chords
	# Pair every chord of a song with the lyric fragment that belongs to it.
	#
	# chords.dat: data frame with chord lines in column 1 (V1) and lyric lines
	#             in column 2 (V2) -- presumably the output of cleanUp().
	# Returns a data frame with one row per chord and the columns `chord` and
	# `lyric`.
	# Reads the global `all_notes` and calls findMin() and eliminate_words()
	# defined in this file.
	createNet <- function(chords.dat){

	# preallocate (number of chords) * (max chords per line) rows of NA; the
	# chord count of a line only considers the part not longer than its lyric
	chords.net <- as.data.frame(
	bind_cols(
	chord = rep(NA, sum(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))*max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))),
	lyric = rep(NA, sum(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))*max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+")))
	))

	# chords
	# The target row for line i / chord j is found by pasting i and j into one
	# number and looking it up in the sorted list of all pasted combinations.
	# NOTE(review): pasted keys can collide (e.g. i = 1, j = 11 and i = 11,
	# j = 1 both give 111) -- confirm this cannot happen for real songs.
	# NOTE(review): the inner loop reuses `j`, so for a line with several
	# chords the same assignments are repeated on every outer iteration.
	for( i in 1:nrow(chords.dat) ) {
	for ( j in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ) {
	if( str_count(chords.dat[i, 1], "\\S+") > 1 ){

	for ( j in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ) {

	chords.net[ match(as.numeric(do.call(paste0, expand.grid(i,j))),
	sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 1 ] <- substring( chords.dat[i, 1], # find chords in verse i
	sort(unique(na.omit(str_locate(chords.dat[i, 1], all_notes)[,1])))[j], # from this position
	sort(unique(na.omit(str_locate(chords.dat[i, 1], all_notes)[,1])))[j]+1 ) # until this position

	}

	} else { chords.net[match(as.numeric(do.call(paste0, expand.grid(i,j))),
	sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 1 ] <- chords.dat[i, 1] }

	}
	}


	# lyrics
	# same row lookup as above; warnings raised inside findMin() are silenced
	for( i in 1:nrow(chords.dat) ){
	for ( k in 1:str_count(substring(chords.dat[i, 1], 1, nchar(chords.dat[i, 2])), "\\S+") ){

	chords.net[match(as.numeric(do.call(paste0, expand.grid(i,k))),
	sort(as.numeric(do.call(paste0, expand.grid(1:sum(na.omit(str_count(chords.dat[, 1], "\\S+"))), 1:max(str_count(substring(chords.dat[, 1], 1, nchar(chords.dat[, 2])), "\\S+"))))))), 2 ] <- suppressWarnings(findMin(chords.dat$V1[i], chords.dat$V2[i], k))

	}
	}

	# keep only the rows that received a chord
	chords.net <- chords.net[complete.cases(chords.net$chord), ]
	rownames(chords.net) <- NULL
	rownames(chords.net) <- as.numeric(rownames(chords.net))
	# compare each lyric with the next one and drop a trailing word fragment
	# that the next lyric starts with
	chords.net <- chords.net %>%
	mutate(
	second_sentence = lead(lyric, n = 1)) %>%
	rowwise() %>%
	mutate(
	lyric =
	eliminate_words(first_sentence = lyric,
	second_sentence = second_sentence)) %>%
	select(-second_sentence)

	return(chords.net)
	}


	# Chords ------------------------------------------------------------------

	# Note letters and the modifier symbols that may follow them
	notes  <- c("A", "B", "C", "D", "E", "F", "G")
	flats  <- "b"
	minor  <- "m"
	sharps <- "#"

	# Every note letter combined with every modifier combination; outer()
	# varies the note letter fastest, so the values are grouped by modifier
	all_notes <- as.vector(outer(
	  notes,
	  c("",
	    flats,
	    sharps,
	    minor,
	    paste0(flats, sharps),
	    paste0(minor, sharps),
	    paste0(flats, minor),
	    paste0(flats, sharps, minor)),
	  paste0
	))



	# Example -----------------------------------------------------------------

	# Song list of the artist; the path of its second entry is used below
	url <- "The Weeknd" %>% get_songs()
	url_songs <- url$url[2]

	# Download and parse the page of that song
	x <- "https://www.cifraclub.com.br" %>%
	  paste0(url_songs) %>%
	  xml2::read_html()

	# chords.dat: chord lines paired with their lyric lines
	chords.dat <- x %>% cleanUp()

	# chords.net: one row per chord with its lyric fragment
	chords.net <- chords.dat %>% createNet()