TextMining Illustration

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#TEXT MINING
    #install.packages("tm")
    #install.packages("wordcloud")
    library(tm); library(SnowballC); library(ggplot2); library(wordcloud)

## Warning: package 'tm' was built under R version 3.1.3

## Loading required package: NLP

## Warning: package 'NLP' was built under R version 3.1.3

## Warning: package 'SnowballC' was built under R version 3.1.1

## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
## 
## Loading required package: RColorBrewer

## Warning: package 'RColorBrewer' was built under R version 3.1.2

# Build the corpus from the files found in the source directory.
docs <- Corpus(
  DirSource("~/Documents/DataScience/textMiningPractice/ds")
)
# inspect(docs)  # uncomment to check details of the whole corpus
# Spot check: print the text of the fourth document.
writeLines(as.character(docs[[4]]))

## student-sandeep graduat: student Studi Comput scienc clemson univers clemson south carolina born rais india school prestigi colleg undergradu colleg click html resum download postscript version divers end involv thing handl manag nice time friend unlik love drink exot mix drink humor humor web humor archiv adrian humor collect laugh web miscellan joke yahoo pictur pavel art galleri florida girl pit page beauti women supermodel imag yahoo index miscellan supermodel pictur indian travel travel agent map airlin number travel agent special travel india miscellan travel relat link miscellan collect unrel link bui car univers sandeep exclus guest book email talk invit sign guest book comment suggest feel free send email clemson talk click check log address live address crazi roommat road clemson phone
## course   web oper system uniqu mwf tai recent explos interest world wide web result evolv set protocol protocol address tradit concern oper system interprocess commun resourc alloc secur gener context internet goal class provid understand current state art web oper system address problem solv provid matur gener purpos web oper system kei hypothesi design class issu address context web address tradit area oper system occasion read web relat paper bear understand current web problem class project report pointer internet research refer inform class syllabu read list schedul longer organ list paper class roster handout verif ssl protocol proof sketch guidelin final project talk report
## faculty  rami melhem professor dept comput scienc phone fax research public optic interconnect parallel system fault toler parallel distribut system parallel distribut comput teach fault toler code theori introduct inform structur introduct oper system ongo fund research project Profession ACTIVE are CLICK- is SEND is Mail

#preprocessing
# Helper transformer: replace every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Turn separator characters into spaces BEFORE removePunctuation runs:
# removePunctuation deletes punctuation outright, which would otherwise glue
# neighbouring words together (e.g. "student-sandeep" -> "studentsandeep").
# One character class handles "-", ":" and "'" in a single pass. The former
# second "'" pass and the " -" pass were no-ops (nothing is left to match once
# the first passes have run), so they are dropped without changing the result.
# NOTE(review): the two identical "'" patterns may originally have been curly
# quotes that were flattened to ASCII -- confirm against the original source.
docs <- tm_map(docs, toSpace, "[-:']")
# writeLines(as.character(docs[[4]])) # good practice: check after each step
docs <- tm_map(docs, removePunctuation) # delete the remaining punctuation
docs <- tm_map(docs, content_transformer(tolower)) # transform to lower case
docs <- tm_map(docs, removeNumbers) # strip digits
# Remove the standard English stop words. Run stopwords("english") to see the
# list; pass c(stopwords("english"), "myword") instead to add your own.
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, stripWhitespace) # collapse runs of whitespace (cosmetic)
writeLines(as.character(docs[[4]])) # inspect output

## student sandeep graduat student studi comput scienc clemson univers clemson south carolina born rais india school prestigi colleg undergradu colleg click html resum download postscript version divers end involv thing handl manag nice time friend unlik love drink exot mix drink humor humor web humor archiv adrian humor collect laugh web miscellan joke yahoo pictur pavel art galleri florida girl pit page beauti women supermodel imag yahoo index miscellan supermodel pictur indian travel travel agent map airlin number travel agent special travel india miscellan travel relat link miscellan collect unrel link bui car univers sandeep exclus guest book email talk invit sign guest book comment suggest feel free send email clemson talk click check log address live address crazi roommat road clemson phone
## course web oper system uniqu mwf tai recent explos interest world wide web result evolv set protocol protocol address tradit concern oper system interprocess commun resourc alloc secur gener context internet goal class provid understand current state art web oper system address problem solv provid matur gener purpos web oper system kei hypothesi design class issu address context web address tradit area oper system occasion read web relat paper bear understand current web problem class project report pointer internet research refer inform class syllabu read list schedul longer organ list paper class roster handout verif ssl protocol proof sketch guidelin final project talk report
## faculty rami melhem professor dept comput scienc phone fax research public optic interconnect parallel system fault toler parallel distribut system parallel distribut comput teach fault toler code theori introduct inform structur introduct oper system ongo fund research project profession active click send mail

# Stemming (this is the step the SnowballC package is loaded for).
docs <- tm_map(docs, stemDocument)
# Optional manual clean-up of a fused token, kept disabled:
#   docs <- tm_map(docs, content_transformer(gsub),
#                  pattern = "andgovern", replacement = "govern")

# Build the document-term matrix from the cleaned corpus.
dtm <- DocumentTermMatrix(docs)
# Peek at a slice of it: term columns 1000 through 1010.
inspect(dtm[, 1000:1010])

## <<DocumentTermMatrix (documents: 4, terms: 11)>>
## Non-/sparse entries: 32/12
## Sparsity           : 27%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## 
##                 Terms
## Docs             chain chair chairman challeng chalmer chamber chamberi
##   txtmining1.txt     5    55       13       21       1       0        2
##   txtmining2.txt     7    55       15       28       2       5        1
##   txtmining3.txt    15    81       15       23       1       9        2
##   txtmining4.txt     0     0        0        0       0       0        0
##                 Terms
## Docs             champaign champion chan chanc
##   txtmining1.txt         3        2    8     5
##   txtmining2.txt         9        5    4    19
##   txtmining3.txt         6       11    4     9
##   txtmining4.txt         0        0    0     0

# Sum each term's column of the document-term matrix to obtain its total
# count over all documents.
freq <- colSums(x = as.matrix(dtm))
# One entry per term, so the length equals the total number of terms.
length(freq)

## [1] 7099

    # Index vector that sorts the term totals in descending order.
    ord <- order(freq, decreasing = TRUE)
    # Most frequently occurring terms (first six of the sorted totals).
    head(freq[ord])

##   comput   scienc   system     univ research     page 
##     8320     4508     4270     4103     3454     3421

    # Least frequently occurring terms (last six of the sorted totals).
    tail(freq[ord])

##  yousef    zaki    zena    zhai zhihong   zwill 
##       3       3       3       3       3       3

# Rebuild the matrix with term filters: keep only terms that are 4 to 20
# characters long and that appear in at least 3 and at most 27 documents.
dtmr <- DocumentTermMatrix(
  docs,
  control = list(
    wordLengths = c(4, 20),
    bounds = list(global = c(3, 27))
  )
)
# Per-term totals for the reduced matrix, then the number of terms kept.
freqr <- colSums(x = as.matrix(dtmr))
length(freqr)

## [1] 4405

    # Descending sort order for the reduced term totals.
    ordr <- order(freqr, decreasing = TRUE)
    # Six most frequent terms, then six least frequent terms.
    head(freqr[ordr])
    tail(freqr[ordr])

##   comput   scienc   system     univ research     page 
##     8320     4508     4270     4103     3454     3421

##   wider winsock yechiam  yemini  yousef zhihong 
##       3       3       3       3       3       3

# Terms of the reduced matrix that occur at least 1000 times; the second
# argument is the lower frequency bound (80 is another value to try).
findFreqTerms(dtmr, 1000)

##  [1] "algorithm" "assign"    "class"     "comput"    "data"     
##  [6] "depart"    "design"    "develop"   "distribut" "email"    
## [11] "engin"     "faculti"   "group"     "home"      "inform"   
## [16] "interest"  "languag"   "lectur"    "link"      "mail"     
## [21] "model"     "network"   "offic"     "page"      "paper"    
## [26] "parallel"  "problem"   "professor" "program"   "project"  
## [31] "research"  "scienc"    "softwar"   "student"   "system"   
## [36] "time"      "univ"      "work"

    # Term correlations: first few terms whose correlation with "access"
    # is at least 0.9 (0.6 is another limit to try).
    head(findAssocs(dtmr, terms = "access", corlimit = 0.9))

##           access
## account        1
## achiev         1
## algebra        1
## algorithm      1
## alta           1
## anderson       1

    # Last few terms whose correlation with "comput" is at least 0.6.
    tail(findAssocs(dtmr, terms = "comput", corlimit = 0.6))

##          comput
## zippel     0.61
## alistair   0.60
## almaden    0.60
## civil      0.60
## forc       0.60
## women      0.60

# Frequency table used for the histogram: one row per term of the reduced
# matrix together with its total number of occurrences.
wf <- data.frame(term = names(freqr), occurrences = freqr)
head(wf, n = 10)

##                  term occurrences
## aaai             aaai         104
## aarhu           aarhu           3
## aaron           aaron          15
## abdelsalam abdelsalam           7
## abduct         abduct           9
## abelson       abelson          11
## abil             abil          47
## ablex           ablex           5
## abraham       abraham          14
## abroad         abroad           8

#plotting
# Bar chart of the terms that occur more than 1000 times.
# The filter uses the data frame's own `occurrences` column rather than the
# global `freqr` vector, so it stays correct even if `wf` is reordered or
# subsetted independently of `freqr`.
p <- ggplot(subset(wf, occurrences > 1000), aes(term, occurrences)) +
  geom_bar(stat = "identity") + # bar height is the count itself
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # slanted labels
p

# Word clouds
# Split the plotting device into one row of two panels.
par(mfrow = c(1, 2))
# Fix the RNG seed so the clouds look the same from run to run.
set.seed(42)
# First cloud: only terms whose total count is at least 500.
wordcloud(words = names(freqr), freq = freqr, min.freq = 500)

## Warning in wordcloud(names(freqr), freqr, min.freq = 500): comput could
## not be fit on page. It will not be plotted.

## Warning in wordcloud(names(freqr), freqr, min.freq = 500): program could
## not be fit on page. It will not be plotted.

## Warning in wordcloud(names(freqr), freqr, min.freq = 500): document could
## not be fit on page. It will not be plotted.

## Warning in wordcloud(names(freqr), freqr, min.freq = 500): perform could
## not be fit on page. It will not be plotted.

## Warning in wordcloud(names(freqr), freqr, min.freq = 500): paper could not
## be fit on page. It will not be plotted.

## Warning in wordcloud(names(freqr), freqr, min.freq = 500): process could
## not be fit on page. It will not be plotted.

## Warning in wordcloud(names(freqr), freqr, min.freq = 500): manag could not
## be fit on page. It will not be plotted.

    # Second cloud: terms with a count of at least 1000, drawn with six
    # colors from the RColorBrewer "Dark2" palette.
    wordcloud(
      words = names(freqr),
      freq = freqr,
      min.freq = 1000,
      colors = brewer.pal(6, "Dark2")
    )

## Warning in wordcloud(names(freqr), freqr, min.freq = 1000, colors =
## brewer.pal(6, : comput could not be fit on page. It will not be plotted.

TextMining Illustration

SA

December 11, 2015