Abstract

Our goal is to roughly show how to accomplish the “Example: Jazz Musicians” section of Chapter 10 of Data Science for Business, using R.

Environment Prep

if (!require('RCurl')) install.packages('RCurl')
if (!require('XML')) install.packages('XML')
if (!require('tm')) install.packages('tm')
if (!require('stringr')) install.packages('stringr')
if (!require('SnowballC')) install.packages('SnowballC')
if (!require('wordcloud')) install.packages('wordcloud')
if (!require('lsa')) install.packages('lsa')

Data Acquisition and Prep

Data Acquisition

# function to grab only text in paragraphs
gettext <- function(url) {
    init <- getURLContent(url) 
    parsed <- htmlParse(init)
    ready <- xpathApply(parsed, '//p', xmlValue)
    ready <- gsub('\\n', ' ', ready)
    ready <- paste(ready, sep = "", collapse = "")
    ready
}

# get info from wiki
if (!exists('cParker')) cParker <- gettext("https://en.wikipedia.org/wiki/Charlie_Parker")
if (!exists('mDavis')) mDavis <- gettext("https://en.wikipedia.org/wiki/Miles_Davis")
if (!exists('dEllington')) dEllington <- gettext("https://en.wikipedia.org/wiki/Duke_Ellington")
if (!exists('jColtrane')) jColtrane <- gettext("https://en.wikipedia.org/wiki/John_Coltrane")
if (!exists('dGillespie')) dGillespie <- gettext("https://en.wikipedia.org/wiki/Dizzy_Gillespie")
if (!exists('lArmstrong')) lArmstrong <- gettext("https://en.wikipedia.org/wiki/Louis_Armstrong")
if (!exists('cHawkins')) cHawkins <- gettext("https://en.wikipedia.org/wiki/Coleman_Hawkins")
if (!exists('cMingus')) cMingus <- gettext("https://en.wikipedia.org/wiki/Charles_Mingus")

Data Prep and Clean

# setup vector for later
list <- c("Charlie_Parker", "dEllington", "jColtrane", "mDavis", 
          "dGillespie", "lArmstrong", "cHawkins", "cMingus", "query")

# we will include the query in our corpus, so when we later look use cosine similarity we it will be included. 
query <- c("famous jazz saxophonist born in kansas who played bebop and latin")
corp <- c(Corpus(VectorSource(cParker)), 
          Corpus(VectorSource(dEllington)), 
          Corpus(VectorSource(jColtrane)),
          Corpus(VectorSource(mDavis)), 
          Corpus(VectorSource(dGillespie)),
          Corpus(VectorSource(lArmstrong)),
          Corpus(VectorSource(cHawkins)),
          Corpus(VectorSource(cMingus)),
          Corpus(VectorSource(query)))

## clean corpus
CleanCorp <- tm_map(corp, removeNumbers)
CleanCorp <- tm_map(CleanCorp, removePunctuation)
CleanCorp <- tm_map(CleanCorp, content_transformer(tolower))
CleanCorp <- tm_map(CleanCorp, removeWords, stopwords("english")) 
CleanCorp <- tm_map(CleanCorp, stemDocument, language="en")

Word Cloud

For fun, let’s make a quick word cloud of the top terms in our corpus before converting to a term document matrix.

wordcloud(CleanCorp, max.words = 100, random.order = FALSE, colors=brewer.pal(5,"Set1"))

Term Document Matrix

Here we will create the Term Document Matrix and search for our query words. The TDF is much larger than appears, as it has counts of all the words by document. NOTE: A document in this corpus = each jazz musician’s Wiki page.

## create TDM
tdm <- TermDocumentMatrix(CleanCorp)
tdm$dimnames$Docs <- list

## Since we're mostly going to review our query, we don't need to remove Sparse Terms
#tdm <- removeSparseTerms(tdm, 1-(3/length(CleanCorp)))

TF-IDF

Our goal is to search for “famous jazz saxophonist born in kansas who played bebop and latin”. First we’ll look only at TF. Then we’ll use the TermDocumentMatrix weighting function to calculate and display TF-IDF.

search <- c("famous","jazz", "saxophonist", "born", "kansa", "play", "bebop", "latin")
r1 <- inspect(tdm[search[search %in% Terms(tdm)], dimnames(tdm)$Docs])
## <<TermDocumentMatrix (terms: 8, documents: 9)>>
## Non-/sparse entries: 56/16
## Sparsity           : 22%
## Maximal term length: 11
## Weighting          : term frequency (tf)
## 
##              Docs
## Terms         Charlie_Parker dEllington jColtrane mDavis dGillespie
##   famous                   1          2         0      1          5
##   jazz                    21         27        24     55         31
##   saxophonist              1          4         7     12          1
##   born                     1          3         4      1          1
##   kansa                    6          0         0      0          0
##   play                    11         32        30     43         16
##   bebop                    7          1         3      7          4
##   latin                    0          1         0      0          2
##              Docs
## Terms         lArmstrong cHawkins cMingus query
##   famous              11        0       0     1
##   jazz                34       11      27     1
##   saxophonist          0        4       4     1
##   born                 2        2       2     1
##   kansa                0        1       0     1
##   play                46        6      19     1
##   bebop                0        3       1     1
##   latin                1        0       0     1
tfidf <- TermDocumentMatrix(CleanCorp, control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))
tfidf$dimnames$Docs <- list

r2 <- inspect(tfidf[search[search %in% Terms(tfidf)], dimnames(tfidf)$Docs])
## <<TermDocumentMatrix (terms: 8, documents: 9)>>
## Non-/sparse entries: 29/43
## Sparsity           : 60%
## Maximal term length: 11
## Weighting          : term frequency - inverse document frequency (tf-idf)
## 
##              Docs
## Terms         Charlie_Parker dEllington jColtrane    mDavis dGillespie
##   famous           0.5849625   1.169925  0.000000 0.5849625   2.924813
##   jazz             0.0000000   0.000000  0.000000 0.0000000   0.000000
##   saxophonist      0.1699250   0.679700  1.189475 2.0391000   0.169925
##   born             0.0000000   0.000000  0.000000 0.0000000   0.000000
##   kansa            9.5097750   0.000000  0.000000 0.0000000   0.000000
##   play             0.0000000   0.000000  0.000000 0.0000000   0.000000
##   bebop            1.1894750   0.169925  0.509775 1.1894750   0.679700
##   latin            0.0000000   1.169925  0.000000 0.0000000   2.339850
##              Docs
## Terms         lArmstrong cHawkins  cMingus     query
##   famous        6.434588 0.000000 0.000000 0.5849625
##   jazz          0.000000 0.000000 0.000000 0.0000000
##   saxophonist   0.000000 0.679700 0.679700 0.1699250
##   born          0.000000 0.000000 0.000000 0.0000000
##   kansa         0.000000 1.584963 0.000000 1.5849625
##   play          0.000000 0.000000 0.000000 0.0000000
##   bebop         0.000000 0.509775 0.169925 0.1699250
##   latin         1.169925 0.000000 0.000000 1.1699250

Conclusion

We use the cosine function from the lsa library to find the cosine similarity of the documents. This is ultimately why we added our query text to the corpus, so we could then compare the others to it in the resulting matrix.

# for fun, let's look at all of the results
final <- cosine(r2)
final
##                Charlie_Parker dEllington jColtrane     mDavis dGillespie
## Charlie_Parker     1.00000000 0.05807045 0.0650556 0.09006485 0.06963707
## dEllington         0.05807045 1.00000000 0.3849612 0.52003471 0.93334585
## jColtrane          0.06505560 0.38496122 1.0000000 0.96328911 0.11125235
## mDavis             0.09006485 0.52003471 0.9632891 1.00000000 0.30923972
## dGillespie         0.06963707 0.93334585 0.1112524 0.30923972 1.00000000
## lArmstrong         0.05993066 0.75710894 0.0000000 0.23664088 0.86501947
## cHawkins           0.91457922 0.16979006 0.4590681 0.45553296 0.06741848
## cMingus            0.04720727 0.38993637 0.9872411 0.93200786 0.08652371
## query              0.78744650 0.59111552 0.1078407 0.17702810 0.58253561
##                lArmstrong   cHawkins    cMingus     query
## Charlie_Parker 0.05993066 0.91457922 0.04720727 0.7874465
## dEllington     0.75710894 0.16979006 0.38993637 0.5911155
## jColtrane      0.00000000 0.45906808 0.98724112 0.1078407
## mDavis         0.23664088 0.45553296 0.93200786 0.1770281
## dGillespie     0.86501947 0.06741848 0.08652371 0.5825356
## lArmstrong     1.00000000 0.00000000 0.00000000 0.3793183
## cHawkins       0.00000000 1.00000000 0.43543016 0.7294875
## cMingus        0.00000000 0.43543016 1.00000000 0.0995961
## query          0.37931830 0.72948755 0.09959610 1.0000000
#now we subset so we only see them each compared to the query 
final2 <- final[1:(dim(final)[1] -1),ncol(final)]
pfinal <- final2[order(final2, decreasing=TRUE)]
pfinal <- round(pfinal, 4)
p <- barplot(pfinal, main="Cosine Similarity to Query", las=2) 
text(x = p, y = pfinal, label = pfinal, pos=1.2, cex = 0.8)

We can see from the above plot that our query text most resembles the Wikipedia text for Charlie Parker. I added Coleman Hawkins, since he is also a sax player from Kansas, but we can see that Bird still comes out ahead (“famous” plays a part).