Our goal is to roughly show how to accomplish the “Example: Jazz Musicians” section of Chapter 10 of Data Science for Business, using R.
if (!require('RCurl')) install.packages('RCurl')
if (!require('XML')) install.packages('XML')
if (!require('tm')) install.packages('tm')
if (!require('stringr')) install.packages('stringr')
if (!require('SnowballC')) install.packages('SnowballC')
if (!require('wordcloud')) install.packages('wordcloud')
if (!require('lsa')) install.packages('lsa')
# function to grab only text in paragraphs
gettext <- function(url) {
init <- getURLContent(url)
parsed <- htmlParse(init)
ready <- xpathApply(parsed, '//p', xmlValue)
ready <- gsub('\\n', ' ', ready)
ready <- paste(ready, sep = "", collapse = "")
ready
}
# get info from wiki
if (!exists('cParker')) cParker <- gettext("https://en.wikipedia.org/wiki/Charlie_Parker")
if (!exists('mDavis')) mDavis <- gettext("https://en.wikipedia.org/wiki/Miles_Davis")
if (!exists('dEllington')) dEllington <- gettext("https://en.wikipedia.org/wiki/Duke_Ellington")
if (!exists('jColtrane')) jColtrane <- gettext("https://en.wikipedia.org/wiki/John_Coltrane")
if (!exists('dGillespie')) dGillespie <- gettext("https://en.wikipedia.org/wiki/Dizzy_Gillespie")
if (!exists('lArmstrong')) lArmstrong <- gettext("https://en.wikipedia.org/wiki/Louis_Armstrong")
if (!exists('cHawkins')) cHawkins <- gettext("https://en.wikipedia.org/wiki/Coleman_Hawkins")
if (!exists('cMingus')) cMingus <- gettext("https://en.wikipedia.org/wiki/Charles_Mingus")
# setup vector for later
list <- c("Charlie_Parker", "dEllington", "jColtrane", "mDavis",
"dGillespie", "lArmstrong", "cHawkins", "cMingus", "query")
# we will include the query in our corpus, so when we later look use cosine similarity we it will be included.
query <- c("famous jazz saxophonist born in kansas who played bebop and latin")
corp <- c(Corpus(VectorSource(cParker)),
Corpus(VectorSource(dEllington)),
Corpus(VectorSource(jColtrane)),
Corpus(VectorSource(mDavis)),
Corpus(VectorSource(dGillespie)),
Corpus(VectorSource(lArmstrong)),
Corpus(VectorSource(cHawkins)),
Corpus(VectorSource(cMingus)),
Corpus(VectorSource(query)))
## clean corpus
CleanCorp <- tm_map(corp, removeNumbers)
CleanCorp <- tm_map(CleanCorp, removePunctuation)
CleanCorp <- tm_map(CleanCorp, content_transformer(tolower))
CleanCorp <- tm_map(CleanCorp, removeWords, stopwords("english"))
CleanCorp <- tm_map(CleanCorp, stemDocument, language="en")
For fun, let’s make a quick word cloud of the top terms in our corpus before converting to a term document matrix.
wordcloud(CleanCorp, max.words = 100, random.order = FALSE, colors=brewer.pal(5,"Set1"))
Here we will create the Term Document Matrix and search for our query words. The TDF is much larger than appears, as it has counts of all the words by document. NOTE: A document in this corpus = each jazz musician’s Wiki page.
## create TDM
tdm <- TermDocumentMatrix(CleanCorp)
tdm$dimnames$Docs <- list
## Since we're mostly going to review our query, we don't need to remove Sparse Terms
#tdm <- removeSparseTerms(tdm, 1-(3/length(CleanCorp)))
Our goal is to search for “famous jazz saxophonist born in kansas who played bebop and latin”. First we’ll look only at TF. Then we’ll use the TermDocumentMatrix weighting function to calculate and display TF-IDF.
search <- c("famous","jazz", "saxophonist", "born", "kansa", "play", "bebop", "latin")
r1 <- inspect(tdm[search[search %in% Terms(tdm)], dimnames(tdm)$Docs])
## <<TermDocumentMatrix (terms: 8, documents: 9)>>
## Non-/sparse entries: 56/16
## Sparsity : 22%
## Maximal term length: 11
## Weighting : term frequency (tf)
##
## Docs
## Terms Charlie_Parker dEllington jColtrane mDavis dGillespie
## famous 1 2 0 1 5
## jazz 21 27 24 55 31
## saxophonist 1 4 7 12 1
## born 1 3 4 1 1
## kansa 6 0 0 0 0
## play 11 32 30 43 16
## bebop 7 1 3 7 4
## latin 0 1 0 0 2
## Docs
## Terms lArmstrong cHawkins cMingus query
## famous 11 0 0 1
## jazz 34 11 27 1
## saxophonist 0 4 4 1
## born 2 2 2 1
## kansa 0 1 0 1
## play 46 6 19 1
## bebop 0 3 1 1
## latin 1 0 0 1
tfidf <- TermDocumentMatrix(CleanCorp, control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))
tfidf$dimnames$Docs <- list
r2 <- inspect(tfidf[search[search %in% Terms(tfidf)], dimnames(tfidf)$Docs])
## <<TermDocumentMatrix (terms: 8, documents: 9)>>
## Non-/sparse entries: 29/43
## Sparsity : 60%
## Maximal term length: 11
## Weighting : term frequency - inverse document frequency (tf-idf)
##
## Docs
## Terms Charlie_Parker dEllington jColtrane mDavis dGillespie
## famous 0.5849625 1.169925 0.000000 0.5849625 2.924813
## jazz 0.0000000 0.000000 0.000000 0.0000000 0.000000
## saxophonist 0.1699250 0.679700 1.189475 2.0391000 0.169925
## born 0.0000000 0.000000 0.000000 0.0000000 0.000000
## kansa 9.5097750 0.000000 0.000000 0.0000000 0.000000
## play 0.0000000 0.000000 0.000000 0.0000000 0.000000
## bebop 1.1894750 0.169925 0.509775 1.1894750 0.679700
## latin 0.0000000 1.169925 0.000000 0.0000000 2.339850
## Docs
## Terms lArmstrong cHawkins cMingus query
## famous 6.434588 0.000000 0.000000 0.5849625
## jazz 0.000000 0.000000 0.000000 0.0000000
## saxophonist 0.000000 0.679700 0.679700 0.1699250
## born 0.000000 0.000000 0.000000 0.0000000
## kansa 0.000000 1.584963 0.000000 1.5849625
## play 0.000000 0.000000 0.000000 0.0000000
## bebop 0.000000 0.509775 0.169925 0.1699250
## latin 1.169925 0.000000 0.000000 1.1699250
We use the cosine function from the lsa library to find the cosine similarity of the documents. This is ultimately why we added our query text to the corpus, so we could then compare the others to it in the resulting matrix.
# for fun, let's look at all of the results
final <- cosine(r2)
final
## Charlie_Parker dEllington jColtrane mDavis dGillespie
## Charlie_Parker 1.00000000 0.05807045 0.0650556 0.09006485 0.06963707
## dEllington 0.05807045 1.00000000 0.3849612 0.52003471 0.93334585
## jColtrane 0.06505560 0.38496122 1.0000000 0.96328911 0.11125235
## mDavis 0.09006485 0.52003471 0.9632891 1.00000000 0.30923972
## dGillespie 0.06963707 0.93334585 0.1112524 0.30923972 1.00000000
## lArmstrong 0.05993066 0.75710894 0.0000000 0.23664088 0.86501947
## cHawkins 0.91457922 0.16979006 0.4590681 0.45553296 0.06741848
## cMingus 0.04720727 0.38993637 0.9872411 0.93200786 0.08652371
## query 0.78744650 0.59111552 0.1078407 0.17702810 0.58253561
## lArmstrong cHawkins cMingus query
## Charlie_Parker 0.05993066 0.91457922 0.04720727 0.7874465
## dEllington 0.75710894 0.16979006 0.38993637 0.5911155
## jColtrane 0.00000000 0.45906808 0.98724112 0.1078407
## mDavis 0.23664088 0.45553296 0.93200786 0.1770281
## dGillespie 0.86501947 0.06741848 0.08652371 0.5825356
## lArmstrong 1.00000000 0.00000000 0.00000000 0.3793183
## cHawkins 0.00000000 1.00000000 0.43543016 0.7294875
## cMingus 0.00000000 0.43543016 1.00000000 0.0995961
## query 0.37931830 0.72948755 0.09959610 1.0000000
#now we subset so we only see them each compared to the query
final2 <- final[1:(dim(final)[1] -1),ncol(final)]
pfinal <- final2[order(final2, decreasing=TRUE)]
pfinal <- round(pfinal, 4)
p <- barplot(pfinal, main="Cosine Similarity to Query", las=2)
text(x = p, y = pfinal, label = pfinal, pos=1.2, cex = 0.8)
We can see from the above plot that our query text most resembles the Wikipedia text for Charlie Parker. I added Coleman Hawkins, since he is also a sax player from Kansas, but we can see that Bird still comes out ahead (“famous” plays a part).