Create a word cloud from the lyrics of an artist's 30 most recent songs
This tutorial shows how to use the XML package and natural language processing to create word clouds from an artist's song lyrics.
Step 1: Load the required libraries. Then make a directory called `songs` in your current working directory.
library(wordcloud)
library(tm)
library(ctv)
library(SnowballC)
library(XML)
library(RColorBrewer)
# install.views("NaturalLanguageProcessing") # install if needed
Next, choose the artist you would like to build a word cloud for, and put their path fragment in url_m below.
For Peter Gabriel, for example, use p/peter+gabriel/; for Robin Thicke, use r/robin+thicke/.
The script will then grab the artist's last 30 lyrics and save each one as a local file to be parsed.
# Put the artist's folder location in url_m (e.g. "b/bob+marley").
url_m <- "b/bob+marley"
url_base <- "http://www.lyricsfreak.com/"
url_end <- "/last_30_lyrics.xml"
# The site publishes an XML feed linking to the artist's last 30 songs;
# using the feed keeps things simple compared with screen-scraping the
# main page.
url <- paste0(url_base, url_m, url_end)
# Parse the feed and extract the href of every <a> element -- this yields
# the list of 30 links to the individual lyric pages, discarding all else.
links <- htmlParse(url)
src <- xpathApply(links, "//a[@href]", xmlGetAttr, "href")
This next section loops through the list of 30 song URLs and creates a text file inside a folder named uniquely for the artist. TODO: make sure the artist name is written out in full.
# Loop through the 30 links, download each HTML page, and keep just the
# song content. It works, but parsing takes a while; the real work is in
# the `content` extraction below.
for (i in seq_along(src)) {
  url <- paste(src[i])  # coerce the list element to a character URL
  pagetree <- htmlTreeParse(url)
  body <- pagetree[[1]]
  # The lyric text sits at this fixed position in the parse tree.
  # NOTE(review): this path is specific to the site's current page layout.
  content <- body$children[[2]]$children$div$children[[3]]
  # Derive the song name (6th path segment of the URL) for the file name.
  nm <- gsub("[0-9 /.html/_+]", "", unlist(strsplit(url, "/"))[6])
  # Derive the artist name (5th path segment) for the folder name.
  artist <- gsub("[0-9 /.html/_+]", "", unlist(strsplit(url, "/"))[5])
  # Create the artist folder; showWarnings = FALSE silences the
  # "directory already exists" warning on iterations 2..30.
  dir.create(file.path("songs", artist), showWarnings = FALSE)
  # Write the song content out as a plain-text file.
  name <- file.path("songs", artist, paste0(nm, ".txt"))
  write(unlist(content), name)
}
Now the text-processing steps come into play.
# Folder holding the downloaded song files for this artist.
cname <- file.path("songs", artist)
# Build a corpus with one document per file in the folder.
docs <- Corpus(DirSource(cname))
# Strip leftover HTML tokens from each document. Use tm_map() with
# content_transformer() rather than assigning gsub() output straight into
# docs[[j]] -- that raw assignment replaces each PlainTextDocument with a
# bare character vector, which is what later makes DocumentTermMatrix()
# complain about "invalid document identifiers".
strip_token <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
for (pattern in c("content_h", "'", "br", "text", "div")) {
  docs <- tm_map(docs, strip_token, pattern)
}
# TODO: no fix yet for deliberate misspellings such as "aint", which get
# used a lot in song lyrics.
# Normalise the text: lower-case, then drop stop words, numbers,
# punctuation, and extra whitespace. tolower() is a plain character
# function, not a tm transformation, so it must be wrapped in
# content_transformer(); passing it bare strips the document class and is
# what produced the "scheduled core 1 encountered error in user code"
# warnings on every tm_map() call below.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
# Term frequencies: one row per document, one column per word.
dtm <- DocumentTermMatrix(docs)
m <- as.matrix(dtm)
# Total count of each word across all songs, most frequent first.
v <- sort(colSums(m), decreasing = TRUE)
head(v, 14)
## Example output for Bob Marley:
## stand don get gonna round worry give whatcha bad
## 57 48 45 35 35 33 31 31 30
## love happy jsnav lord now
## 30 29 28 28 28
words <- names(v)
d <- data.frame(word = words, freq = v)
# Save the cloud as a PNG next to the song text files.
loc <- paste0(cname, "/", artist, ".png")
# Colour palette from RColorBrewer.
pal2 <- brewer.pal(8, "Dark2")
# Open a PNG device so the word cloud is written to a permanent file.
png(loc, width = 800, height = 600)
wordcloud(d$word, d$freq, min.freq = 10, colors = pal2, scale = c(8, .2),
          rot.per = .15, random.order = FALSE, max.words = Inf)
# wordcloud(d$word, d$freq, min.freq = 20)
dev.off()
## dev.off() prints the now-current device, e.g.:
## pdf
## 2