# Install
#install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer"))
# Load Libraries
library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
getStemLanguages()
## [1] "danish" "dutch" "english" "finnish" "french"
## [6] "german" "hungarian" "italian" "norwegian" "porter"
## [11] "portuguese" "romanian" "russian" "spanish" "swedish"
## [16] "turkish"
wordStem(c("win", "wins", "winning", "winnings", "winner", 'unhappiness'))
## [1] "win" "win" "win" "win" "winner" "unhappi"
The text is loaded with the Corpus() function (corpus is Latin for "body") from the text mining (tm) package. A corpus is the body of work/data under study. If you had many documents, you could simply concatenate them (e.g., with cat) and redirect the result into a single .txt file.
We start by importing the text file created in Step 4.
We first work with the text from ONLY the second column, the short package descriptions, of the CRAN package list found on the CRAN website under 'Packages' and then 'Table of available packages, sorted by name'.
#setwd('~/Desktop')
text <- readLines("Rp_col2.txt")
# Load the data as a corpus
docs <- Corpus(VectorSource(text))
# Inspect the corpus by uncommenting the line below.
#inspect(docs)
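As an aside, if the text were spread across many files, tm can also read a whole directory at once instead of a single concatenated file. A minimal sketch, assuming a hypothetical folder texts/ containing .txt files:
# Build a corpus from every .txt file in a directory (path is hypothetical)
docs_dir <- Corpus(DirSource("texts/", pattern = "\\.txt$"))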
Preprocessing is performed with the tm_map() function, which applies a transformation to every document, for example to replace or remove special characters from the text. The "transformation drops documents" warnings printed below are emitted whenever tm_map() is applied to a SimpleCorpus and can safely be ignored here.
# Convert all text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove punctuations
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation
## drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop words
# by adding them as a character vector
docs <- tm_map(docs, removeWords, c("big", "small"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, c("big", "small")):
## transformation drops documents
# Text stemming
docs <- tm_map(docs, stemDocument)
## Warning in tm_map.SimpleCorpus(docs, stemDocument): transformation drops
## documents
stopwords(kind = "en")
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
It is even possible to be very specific about the symbols or patterns to remove, using a custom transformer.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "/"): transformation drops
## documents
docs <- tm_map(docs, toSpace, "@")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "@"): transformation drops
## documents
docs <- tm_map(docs, toSpace, "\\|")
## Warning in tm_map.SimpleCorpus(docs, toSpace, "\\|"): transformation drops
## documents
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
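For reuse, the whole cleaning pipeline can be wrapped in a helper function. A sketch following the same steps as above; clean_corpus() is a name introduced here, not part of tm:
clean_corpus <- function(corpus, extra_stopwords = character(0)) {
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, c(stopwords("english"), extra_stopwords))
  corpus <- tm_map(corpus, stemDocument)
  corpus <- tm_map(corpus, toSpace, "/")
  corpus <- tm_map(corpus, toSpace, "@")
  corpus <- tm_map(corpus, toSpace, "\\|")
  tm_map(corpus, stripWhitespace)
}
# Equivalent one-call cleaning of a fresh corpus:
# docs <- clean_corpus(Corpus(VectorSource(text)), extra_stopwords = c("big", "small"))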
The term-document matrix is a table containing the frequency of the words: row names are terms and column names are documents. (DocumentTermMatrix() builds the transpose, with documents as rows.) The function TermDocumentMatrix() from the tm package can be used as follows:
doc_mat <- TermDocumentMatrix(docs)
m <- as.matrix(doc_mat)
v <- sort(rowSums(m), decreasing = TRUE)
d_Rcran <- data.frame(word = names(v), freq = v)
head(d_Rcran, 5)
## word freq
## data data 2006
## model model 1765
## analysi analysi 1370
## function function 812
## estim estim 788
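For large corpora, converting the full term-document matrix with as.matrix() can exhaust memory; sparse terms can be dropped first. A sketch, where the 0.99 sparsity threshold is an arbitrary choice:
# Keep only terms that appear in at least ~1% of documents
slim_mat <- removeSparseTerms(doc_mat, sparse = 0.99)
m_slim <- as.matrix(slim_mat)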
The relative frequency of words can be illustrated as a word cloud, where the font size of each word reflects its frequency.
wordcloud(words = d_Rcran$word,
freq = d_Rcran$freq,
min.freq = 1,
max.words = 100,
random.order = FALSE,
rot.per = 0.0, # proportion of words rotated 90 degrees
colors = brewer.pal(4, "Set1"))
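Word placement in the cloud is randomized, so the layout differs between runs. Setting a seed first makes the figure reproducible (the seed value is arbitrary):
set.seed(1234) # fix the random layout
wordcloud(words = d_Rcran$word,
freq = d_Rcran$freq,
min.freq = 1,
max.words = 100,
random.order = FALSE,
rot.per = 0.0,
colors = brewer.pal(4, "Set1"))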
Explore frequent terms and their associations
One can have a look at the frequent terms in the term-document matrix as follows. In the example below we want to find the words that occur at least 100 times, i.e., with frequency >= 100.
Rcran_list <- findFreqTerms(doc_mat, lowfreq = 100)
Rcran_list
## [1] "access" "model" "predict" "api" "bayesian"
## [6] "comput" "tool" "data" "analysi" "base"
## [11] "packag" "implement" "optim" "estim" "random"
## [16] "via" "analys" "file" "visual" "associ"
## [21] "use" "multivari" "network" "measur" "popul"
## [26] "detect" "simul" "time" "fit" "regress"
## [31] "gene" "infer" "seri" "function" "process"
## [36] "valu" "sampl" "plot" "spars" "respons"
## [41] "algorithm" "design" "multipl" "select" "dynam"
## [46] "distribut" "method" "effect" "linear" "matrix"
## [51] "code" "robust" "test" "map" "statist"
## [56] "classif" "read" "class" "mixtur" "general"
## [61] "learn" "surviv" "graphic" "interfac" "densiti"
## [66] "genet" "spatial" "calcul" "cluster" "fast"
## [71] "inform" "studi" "object" "equat" "curv"
## [76] "variabl" "set" "dataset" "util" "generat"
## [81] "creat" "databas" "interact" "sequenc" "client"
## [86] "mix" "weight" "analyz" "system" "nonparametr"
## [91] "structur" "tabl" "likelihood" "tree" "librari"
## [96] "correl" "interv" "covari" "differenti"
Associations between terms can be explored with findAssocs(), which returns the terms whose occurrence correlates with a given term above the corlimit threshold.
findAssocs(doc_mat, terms = "model", corlimit = 0.05)
## $model
## linear mix general fit
## 0.24 0.21 0.18 0.17
## mixtur bayesian hidden mixedeffect
## 0.13 0.12 0.10 0.10
## addit autoregress gaussian markov
## 0.09 0.09 0.09 0.09
## hazard latent cox logit
## 0.09 0.09 0.09 0.08
## effect semiparametr averag equat
## 0.07 0.07 0.07 0.07
## structur joint loglinear rasch
## 0.07 0.07 0.07 0.07
## frailti multist predict estim
## 0.07 0.07 0.06 0.06
## regress respons dynam speci
## 0.06 0.06 0.06 0.06
## hierarch hydrolog nonlinear garch
## 0.06 0.06 0.06 0.06
## generalis topic exponentialfamili select
## 0.06 0.06 0.06 0.05
## stochast atlanti diagnost agedepth
## 0.05 0.05 0.05 0.05
## exponenti panel probit illnessdeath
## 0.05 0.05 0.05 0.05
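findAssocs() also accepts several terms in one call. A sketch with an arbitrary 0.2 threshold:
# Associations for several terms at once
findAssocs(doc_mat, terms = c("data", "regress"), corlimit = 0.2)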
The ten most frequent words are:
head(d_Rcran, 10)
## word freq
## data data 2006
## model model 1765
## analysi analysi 1370
## function function 812
## estim estim 788
## use use 743
## regress regress 583
## test test 569
## tool tool 555
## method method 473
The frequencies of the 25 most common words are plotted.
barplot(d_Rcran[1:25,]$freq,
las = 2,
names.arg = d_Rcran[1:25,]$word,
col ="lightyellow",
main ="Most Frequent Words From R-cran Packages",
ylab = "Word Count")