A. Load Libraries

# Install the packages once, if needed
#install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer"))

# Load Libraries
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")

Which languages does the SnowballC stemming library support?

getStemLanguages()
##  [1] "danish"     "dutch"      "english"    "finnish"    "french"    
##  [6] "german"     "hungarian"  "italian"    "norwegian"  "porter"    
## [11] "portuguese" "romanian"   "russian"    "spanish"    "swedish"   
## [16] "turkish"

The SnowballC library carries out word stemming: it reduces each word to its root (in many cases) by stripping common suffixes. Note in the example below that prefixes are kept, so "unhappiness" stems to "unhappi" rather than "happi".

wordStem(c("win", "wins", "winning", "winnings", "winner", "unhappiness"))
## [1] "win"     "win"     "win"     "win"     "winner"  "unhappi"
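
wordStem() stems English by default; any of the languages returned by getStemLanguages() can also be passed via its language argument. A quick sketch (the French words are only illustrative):

# Stem non-English words by naming one of the supported languages
wordStem(c("manger", "mangera", "mangeront"), language = "french")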

Text Mining

The text is loaded with the Corpus() function (corpus is Latin for "body") from the text mining (tm) package; the corpus is the body of work/data under analysis. If you had many documents, you could simply concatenate them (e.g. with cat) and redirect the output into a single .txt file.
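
For example, a minimal sketch of that concatenation step in R (the file names here are hypothetical):

# Combine several plain-text documents into a single file, line by line
files <- c("doc1.txt", "doc2.txt", "doc3.txt")
writeLines(unlist(lapply(files, readLines)), "combined.txt")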

We start by importing the text file created in Step 4.

We first work with the text from ONLY the second column, the R-cran package short descriptions, found by following ‘Packages’ and then ‘Table of available packages, sorted by name’ on the R-cran site.

B. Import Text/Corpus/Data

#setwd('~/Desktop')
text <- readLines("Rp_col2.txt")

# Load the data as a corpus; VectorSource() treats each line as a separate document
docs <- Corpus(VectorSource(text))

# Inspect the documents by uncommenting the line below
#inspect(docs) 

C. Text Preprocessing & Cleaning Text

Preprocessing is performed with the tm_map() function, which applies a transformation, for example replacing or removing special characters, to every document in the corpus.

# Convert all text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove punctuations
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop words,
# specified as a character vector
docs <- tm_map(docs, removeWords, c("big", "small")) 
## Warning in tm_map.SimpleCorpus(docs, removeWords, c("big", "small")):
## transformation drops documents
# Text stemming
docs <- tm_map(docs, stemDocument)
## Warning in tm_map.SimpleCorpus(docs, stemDocument): transformation drops
## documents

Display English Stopwords

stopwords(kind = "en")
##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"
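
If the default English list proves too short, tm also ships the larger SMART stopword list; a quick way to compare the two:

# Compare the sizes of the two built-in stopword lists
length(stopwords("english"))
length(stopwords("SMART"))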

Replacing “/”, “@” and “|” with space

You can be even more specific about which symbols or patterns to replace, by defining a custom content transformer:

toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

docs <- tm_map(docs, toSpace, "/")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "/"): transformation drops
## documents
docs <- tm_map(docs, toSpace, "@")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "@"): transformation drops
## documents
docs <- tm_map(docs, toSpace, "\\|")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "\\|"): transformation drops
## documents
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
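
Note that the order of transformations matters: removePunctuation ran earlier and deleted most symbols outright, fusing pairs like "model/fit" into "modelfit". In a fresh pipeline you would typically apply these space substitutions before removing punctuation, so that such terms split into separate words.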

D. Build a Term-document Matrix

The term-document matrix is a table containing the frequency of the words: row names are terms and column names are documents. The function TermDocumentMatrix() from the text mining package can be used as follows:

doc_mat <- TermDocumentMatrix(docs)

# Convert to an ordinary matrix and total the counts for each term
m <- as.matrix(doc_mat)
v <- sort(rowSums(m), decreasing = TRUE)

# Assemble a term/frequency data frame, most frequent terms first
d_Rcran <- data.frame(word = names(v), freq = v)

head(d_Rcran, 5)
##              word freq
## data         data 2006
## model       model 1765
## analysi   analysi 1370
## function function  812
## estim       estim  788
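
On larger corpora the dense matrix can grow unwieldy; tm's removeSparseTerms() can drop rarely occurring terms before the as.matrix() conversion. A minimal sketch (the sparsity threshold is an arbitrary choice):

# Keep only terms that appear in at least ~1% of documents
doc_mat_small <- removeSparseTerms(doc_mat, sparse = 0.99)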

E. Generate First Word Cloud from R-cran package descriptions

The importance of words can be illustrated as a word cloud, where the font size of each word reflects its frequency.

wordcloud(words = d_Rcran$word, 
          freq = d_Rcran$freq, 
          min.freq = 1,
          max.words = 100, 
          random.order = FALSE,
          rot.per = 0.0, # proportion of words rotated 90 degrees
          colors = brewer.pal(4, "Set1"))
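
Word placement in the cloud is partly random, so repeated runs produce slightly different layouts. Calling set.seed() with any fixed value before wordcloud() makes the result reproducible:

set.seed(1234) # arbitrary fixed seed for a repeatable layout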

Going Further

Explore frequent terms and their associations

One can have a look at the frequent terms in the term-document matrix as follows. In the example below we find the words that occur at least 100 times (frequency >= 100).

Rcran_list <- findFreqTerms(doc_mat, lowfreq = 100)
Rcran_list
##  [1] "access"      "model"       "predict"     "api"         "bayesian"   
##  [6] "comput"      "tool"        "data"        "analysi"     "base"       
## [11] "packag"      "implement"   "optim"       "estim"       "random"     
## [16] "via"         "analys"      "file"        "visual"      "associ"     
## [21] "use"         "multivari"   "network"     "measur"      "popul"      
## [26] "detect"      "simul"       "time"        "fit"         "regress"    
## [31] "gene"        "infer"       "seri"        "function"    "process"    
## [36] "valu"        "sampl"       "plot"        "spars"       "respons"    
## [41] "algorithm"   "design"      "multipl"     "select"      "dynam"      
## [46] "distribut"   "method"      "effect"      "linear"      "matrix"     
## [51] "code"        "robust"      "test"        "map"         "statist"    
## [56] "classif"     "read"        "class"       "mixtur"      "general"    
## [61] "learn"       "surviv"      "graphic"     "interfac"    "densiti"    
## [66] "genet"       "spatial"     "calcul"      "cluster"     "fast"       
## [71] "inform"      "studi"       "object"      "equat"       "curv"       
## [76] "variabl"     "set"         "dataset"     "util"        "generat"    
## [81] "creat"       "databas"     "interact"    "sequenc"     "client"     
## [86] "mix"         "weight"      "analyz"      "system"      "nonparametr"
## [91] "structur"    "tabl"        "likelihood"  "tree"        "librari"    
## [96] "correl"      "interv"      "covari"      "differenti"
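
findAssocs() then lists the terms most correlated with a chosen term; here we ask which words co-occur with "model" at a correlation of at least 0.05.
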
findAssocs(doc_mat, terms = "model", corlimit = 0.05)
## $model
##            linear               mix           general               fit 
##              0.24              0.21              0.18              0.17 
##            mixtur          bayesian            hidden       mixedeffect 
##              0.13              0.12              0.10              0.10 
##             addit       autoregress          gaussian            markov 
##              0.09              0.09              0.09              0.09 
##            hazard            latent               cox             logit 
##              0.09              0.09              0.09              0.08 
##            effect      semiparametr            averag             equat 
##              0.07              0.07              0.07              0.07 
##          structur             joint         loglinear             rasch 
##              0.07              0.07              0.07              0.07 
##           frailti           multist           predict             estim 
##              0.07              0.07              0.06              0.06 
##           regress           respons             dynam             speci 
##              0.06              0.06              0.06              0.06 
##          hierarch          hydrolog         nonlinear             garch 
##              0.06              0.06              0.06              0.06 
##         generalis             topic exponentialfamili            select 
##              0.06              0.06              0.06              0.05 
##          stochast           atlanti          diagnost          agedepth 
##              0.05              0.05              0.05              0.05 
##         exponenti             panel            probit      illnessdeath 
##              0.05              0.05              0.05              0.05
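
Finally, the ten most frequent stemmed terms:
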
head(d_Rcran, 10)
##              word freq
## data         data 2006
## model       model 1765
## analysi   analysi 1370
## function function  812
## estim       estim  788
## use           use  743
## regress   regress  583
## test         test  569
## tool         tool  555
## method     method  473

Plot Word frequencies of R-cran descriptions

The 25 most frequent words are plotted below.

barplot(d_Rcran[1:25,]$freq, 
        las = 2, 
        names.arg = d_Rcran[1:25,]$word,
        col = "lightyellow", 
        main = "Most Frequent Words From R-cran Packages",
        ylab = "Word Count")
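
To save the chart to a file rather than the plotting window, the same call can be wrapped in a graphics device (the file name is just an example):

# Write the bar plot to a PNG file
png("rcran_word_freq.png", width = 800, height = 600)
barplot(d_Rcran[1:25,]$freq,
        las = 2,
        names.arg = d_Rcran[1:25,]$word,
        col = "lightyellow",
        main = "Most Frequent Words From R-cran Packages",
        ylab = "Word Count")
dev.off()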