Create a word cloud from the lyrics of an artist's 30 most recent songs
This tutorial shows how to use the XML package and natural language processing to create word clouds from an artist's song lyrics.
Step 1: Load the required libraries. Then make a directory called `songs` in your current working directory.
library(wordcloud)
library(tm)
library(ctv)
library(SnowballC)
library(XML)
library(RColorBrewer)
# install.views("NaturalLanguageProcessing") # install if needed
Next, choose the artist you would like to build a word cloud for, and put their path fragment in url_m below.
For Peter Gabriel, for example, use p/peter+gabriel/; for Robin Thicke, use r/robin+thicke/.
The script will then grab the artist's last 30 lyrics and save each one as a local file to be parsed.
# Put the artist's folder location in url_m (e.g. "b/bob+marley").
url_m <- "b/bob+marley"
url_base <- "http://www.lyricsfreak.com/"
url_end <- "/last_30_lyrics.xml"
# The site publishes an XML feed linking to the artist's last 30 songs;
# using the feed keeps things simple compared with screen-scraping the
# main page.
url <- paste0(url_base, url_m, url_end)
# Parse the feed and extract the href of every <a> element -- this yields
# the list of 30 links to the individual lyric pages, discarding all else.
links <- htmlParse(url)
src <- xpathApply(links, "//a[@href]", xmlGetAttr, "href")
This next section loops through the list of 30 song URLs and creates a text file inside a folder named uniquely for the artist. TODO: make sure the artist name is written out in full.
# Loop through the 30 links, download each HTML page, and keep just the
# song content. It works, but parsing takes a while; the real work is in
# the `content` extraction below.
for (i in seq_along(src)) {
  url <- paste(src[i])  # coerce the list element to a character URL
  pagetree <- htmlTreeParse(url)
  body <- pagetree[[1]]
  # The lyric text sits at this fixed position in the parse tree.
  # NOTE(review): this path is specific to the site's current page layout.
  content <- body$children[[2]]$children$div$children[[3]]
  # Derive the song name (6th path segment of the URL) for the file name.
  nm <- gsub("[0-9 /.html/_+]", "", unlist(strsplit(url, "/"))[6])
  # Derive the artist name (5th path segment) for the folder name.
  artist <- gsub("[0-9 /.html/_+]", "", unlist(strsplit(url, "/"))[5])
  # Create the artist folder; showWarnings = FALSE silences the
  # "directory already exists" warning on iterations 2..30.
  dir.create(file.path("songs", artist), showWarnings = FALSE)
  # Write the song content out as a plain-text file.
  name <- file.path("songs", artist, paste0(nm, ".txt"))
  write(unlist(content), name)
}
Now the text-processing steps come into play.
# Folder holding the downloaded song files for this artist.
cname <- file.path("songs", artist)
# Build a corpus with one document per file in the folder.
docs <- Corpus(DirSource(cname))
# Strip leftover HTML tokens from each document. Use tm_map() with
# content_transformer() rather than assigning gsub() output straight into
# docs[[j]] -- that raw assignment replaces each PlainTextDocument with a
# bare character vector, which is what later makes DocumentTermMatrix()
# complain about "invalid document identifiers".
strip_token <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
for (pattern in c("content_h", "'", "br", "text", "div")) {
  docs <- tm_map(docs, strip_token, pattern)
}
# TODO: no fix yet for deliberate misspellings such as "aint", which get
# used a lot in song lyrics.
# Normalise the text: lower-case, then drop stop words, numbers,
# punctuation, and extra whitespace. tolower() is a plain character
# function, not a tm transformation, so it must be wrapped in
# content_transformer(); passing it bare strips the document class and is
# what produced the "scheduled core 1 encountered error in user code"
# warnings on every tm_map() call below.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
# Term frequencies: one row per document, one column per word.
dtm <- DocumentTermMatrix(docs)
m <- as.matrix(dtm)
# Total count of each word across all songs, most frequent first.
v <- sort(colSums(m), decreasing = TRUE)
head(v, 14)
## Example output for Bob Marley:
## stand don get gonna round worry give whatcha bad
## 57 48 45 35 35 33 31 31 30
## love happy jsnav lord now
## 30 29 28 28 28
words <- names(v)
d <- data.frame(word = words, freq = v)
# Save the cloud as a PNG next to the song text files.
loc <- paste0(cname, "/", artist, ".png")
# Colour palette from RColorBrewer.
pal2 <- brewer.pal(8, "Dark2")
# Open a PNG device so the word cloud is written to a permanent file.
png(loc, width = 800, height = 600)
wordcloud(d$word, d$freq, min.freq = 10, colors = pal2, scale = c(8, .2),
          rot.per = .15, random.order = FALSE, max.words = Inf)
# wordcloud(d$word, d$freq, min.freq = 20)
dev.off()
## dev.off() prints the now-current device, e.g.:
## pdf
## 2