This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#TEXT MINING
#install.packages("tm")
#install.packages("wordcloud")
library(tm); library(SnowballC); library(ggplot2); library(wordcloud)
## Warning: package 'tm' was built under R version 3.1.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.1.3
## Warning: package 'SnowballC' was built under R version 3.1.1
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
##
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.1.2
# Build the corpus from the files in the practice directory.
corpus_source <- DirSource("~/Documents/DataScience/textMiningPractice/ds")
docs <- Corpus(corpus_source)
# inspect(docs)  # uncomment to check details of the corpus
# Spot check: print the text of the fourth document.
writeLines(as.character(docs[[4]]))
## student-sandeep graduat: student Studi Comput scienc clemson univers clemson south carolina born rais india school prestigi colleg undergradu colleg click html resum download postscript version divers end involv thing handl manag nice time friend unlik love drink exot mix drink humor humor web humor archiv adrian humor collect laugh web miscellan joke yahoo pictur pavel art galleri florida girl pit page beauti women supermodel imag yahoo index miscellan supermodel pictur indian travel travel agent map airlin number travel agent special travel india miscellan travel relat link miscellan collect unrel link bui car univers sandeep exclus guest book email talk invit sign guest book comment suggest feel free send email clemson talk click check log address live address crazi roommat road clemson phone
## course web oper system uniqu mwf tai recent explos interest world wide web result evolv set protocol protocol address tradit concern oper system interprocess commun resourc alloc secur gener context internet goal class provid understand current state art web oper system address problem solv provid matur gener purpos web oper system kei hypothesi design class issu address context web address tradit area oper system occasion read web relat paper bear understand current web problem class project report pointer internet research refer inform class syllabu read list schedul longer organ list paper class roster handout verif ssl protocol proof sketch guidelin final project talk report
## faculty rami melhem professor dept comput scienc phone fax research public optic interconnect parallel system fault toler parallel distribut system parallel distribut comput teach fault toler code theori introduct inform structur introduct oper system ongo fund research project Profession ACTIVE are CLICK- is SEND is Mail
#preprocessing
# toSpace(x, pattern): content transformer that replaces every match of the
# regular expression `pattern` in the document text `x` with a single space.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})
# Turn hyphens, colons and apostrophes into spaces in a single pass, so the
# words they join stay separate once punctuation is deleted below.
# This replaces five separate calls: the apostrophe rule was listed twice (the
# second call was a no-op) and the trailing " -" rule could never match because
# every hyphen had already been replaced.
# NOTE(review): the duplicated apostrophe rule looks like a pair of curly
# quotes that lost their encoding, so both curly single quotes are handled
# here as well - confirm against the corpus.
docs <- tm_map(docs, toSpace, "[-:'\u2018\u2019]")
# writeLines(as.character(docs[[4]])) #Good practice to check after each step.
# Remove the remaining punctuation. removePunctuation deletes the marks
# outright (it does not insert a space), which is why the word-joining
# characters were converted to spaces first.
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, content_transformer(tolower)) # Transform to lower case
docs <- tm_map(docs, removeNumbers) # Strip digits
# Remove stopwords from the standard English list. Printing
# stopwords("english") shows the list; to drop extra words, call removeWords
# again with your own character vector.
docs <- tm_map(docs, removeWords, stopwords("english"))
# Collapse the runs of blanks left behind by the removals above.
docs <- tm_map(docs, stripWhitespace)
writeLines(as.character(docs[[4]])) # inspect output
## student sandeep graduat student studi comput scienc clemson univers clemson south carolina born rais india school prestigi colleg undergradu colleg click html resum download postscript version divers end involv thing handl manag nice time friend unlik love drink exot mix drink humor humor web humor archiv adrian humor collect laugh web miscellan joke yahoo pictur pavel art galleri florida girl pit page beauti women supermodel imag yahoo index miscellan supermodel pictur indian travel travel agent map airlin number travel agent special travel india miscellan travel relat link miscellan collect unrel link bui car univers sandeep exclus guest book email talk invit sign guest book comment suggest feel free send email clemson talk click check log address live address crazi roommat road clemson phone
## course web oper system uniqu mwf tai recent explos interest world wide web result evolv set protocol protocol address tradit concern oper system interprocess commun resourc alloc secur gener context internet goal class provid understand current state art web oper system address problem solv provid matur gener purpos web oper system kei hypothesi design class issu address context web address tradit area oper system occasion read web relat paper bear understand current web problem class project report pointer internet research refer inform class syllabu read list schedul longer organ list paper class roster handout verif ssl protocol proof sketch guidelin final project talk report
## faculty rami melhem professor dept comput scienc phone fax research public optic interconnect parallel system fault toler parallel distribut system parallel distribut comput teach fault toler code theori introduct inform structur introduct oper system ongo fund research project profession active click send mail
# Stemming (the SnowballC package loaded above is used for this step).
docs <- tm_map(docs, stemDocument)
# Optional clean-up, left disabled:
# docs <- tm_map(docs, content_transformer(gsub),
#                pattern = "andgovern", replacement = "govern")
# Document-term matrix: one row per document, one column per term.
dtm <- DocumentTermMatrix(docs)
# Look at a small slice of the matrix (term columns 1000 to 1010).
inspect(dtm[, seq(1000, 1010)])
## <<DocumentTermMatrix (documents: 4, terms: 11)>>
## Non-/sparse entries: 32/12
## Sparsity : 27%
## Maximal term length: 9
## Weighting : term frequency (tf)
##
## Terms
## Docs chain chair chairman challeng chalmer chamber chamberi
## txtmining1.txt 5 55 13 21 1 0 2
## txtmining2.txt 7 55 15 28 2 5 1
## txtmining3.txt 15 81 15 23 1 9 2
## txtmining4.txt 0 0 0 0 0 0 0
## Terms
## Docs champaign champion chan chanc
## txtmining1.txt 3 2 8 5
## txtmining2.txt 9 5 4 19
## txtmining3.txt 6 11 4 9
## txtmining4.txt 0 0 0 0
# Total count of each term over all documents: sum every column of the dense
# document-term matrix.
freq <- colSums(as.matrix(dtm))
length(freq)  # one entry per term, so this is the total number of terms
## [1] 7099
# Term positions ranked from most to least frequent (descending order).
ord <- order(freq, decreasing = TRUE)
head(freq[ord])  # most frequently occurring terms
## comput scienc system univ research page
## 8320 4508 4270 4103 3454 3421
tail(freq[ord])  # least frequently occurring terms
## yousef zaki zena zhai zhihong zwill
## 3 3 3 3 3 3
# Rebuild the matrix with a restricted vocabulary: keep only terms that are
# 4 to 20 characters long and that occur in between 3 and 27 documents.
# NOTE(review): the inspect() output earlier in this file reports 4 documents;
# if that is the whole corpus, the upper bound of 27 never excludes a term, so
# very frequent words are not actually removed - confirm the intended bounds.
dtmr <- DocumentTermMatrix(
  docs,
  control = list(
    wordLengths = c(4, 20),
    bounds = list(global = c(3, 27))
  )
)
# Total count per remaining term.
freqr <- colSums(as.matrix(dtmr))
length(freqr)
## [1] 4405
# Rank the remaining terms from most to least frequent, then show both ends.
ordr <- order(freqr, decreasing = TRUE)
head(freqr[ordr])
tail(freqr[ordr])
## comput scienc system univ research page
## 8320 4508 4270 4103 3454 3421
## wider winsock yechiam yemini yousef zhihong
## 3 3 3 3 3 3
# Terms whose total count is at least 1000. lowfreq is the lower bound; the
# upper bound is spelled out at its default of Inf.
findFreqTerms(dtmr, lowfreq = 1000, highfreq = Inf) # 80
## [1] "algorithm" "assign" "class" "comput" "data"
## [6] "depart" "design" "develop" "distribut" "email"
## [11] "engin" "faculti" "group" "home" "inform"
## [16] "interest" "languag" "lectur" "link" "mail"
## [21] "model" "network" "offic" "page" "paper"
## [26] "parallel" "problem" "professor" "program" "project"
## [31] "research" "scienc" "softwar" "student" "system"
## [36] "time" "univ" "work"
# Term correlations: for the given term, list the terms whose correlation with
# it is at least the limit passed as corlimit.
head(findAssocs(dtmr, "access", corlimit = 0.9)) #0.6
## access
## account 1
## achiev 1
## algebra 1
## algorithm 1
## alta 1
## anderson 1
tail(findAssocs(dtmr, "comput", corlimit = 0.6))
## comput
## zippel 0.61
## alistair 0.60
## almaden 0.60
## civil 0.60
## forc 0.60
## women 0.60
#histogram
# Frequency table for plotting: one row per term with its total count.
wf <- data.frame(term = names(freqr), occurrences = freqr)
head(wf, n = 10)
## term occurrences
## aaai aaai 104
## aarhu aarhu 3
## aaron aaron 15
## abdelsalam abdelsalam 7
## abduct abduct 9
## abelson abelson 11
## abil abil 47
## ablex ablex 5
## abraham abraham 14
## abroad abroad 8
#plotting
# Bar chart of the terms that occur more than 1000 times in total.
# The filter uses the data frame's own `occurrences` column rather than the
# external `freqr` vector, so the subset stays correct even if `wf` is later
# reordered or filtered and no longer lines up with `freqr`.
p <- ggplot(subset(wf, occurrences > 1000), aes(term, occurrences)) +
  geom_bar(stat = "identity") +  # bar height is the count itself
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # slant the labels
p
#wordcloud
# Draw the two clouds side by side. par() returns the previous settings, which
# are kept so the plotting layout can be restored once both clouds are drawn
# instead of leaking into later plots.
old_par <- par(mfrow = c(1, 2))
# Fix the random seed so the word placement is reproducible from run to run.
# It is set once, before the first cloud only.
set.seed(42)
# First cloud: only terms with a total count of at least 500.
# NOTE(review): the warnings recorded below show that some terms, including
# the most frequent one, were dropped because they did not fit; wordcloud's
# `scale` argument (font size range) can presumably be reduced to fit them -
# confirm on the target device.
wordcloud(names(freqr), freqr, min.freq = 500)
## Warning in wordcloud(names(freqr), freqr, min.freq = 500): comput could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(names(freqr), freqr, min.freq = 500): program could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(names(freqr), freqr, min.freq = 500): document could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(names(freqr), freqr, min.freq = 500): perform could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(names(freqr), freqr, min.freq = 500): paper could not
## be fit on page. It will not be plotted.
## Warning in wordcloud(names(freqr), freqr, min.freq = 500): process could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(names(freqr), freqr, min.freq = 500): manag could not
## be fit on page. It will not be plotted.
#...add color
# Second cloud: terms with a total count of at least 1000, coloured with a
# six-colour "Dark2" palette.
wordcloud(
  names(freqr), freqr,
  min.freq = 1000,
  colors = brewer.pal(6, "Dark2")
)
## Warning in wordcloud(names(freqr), freqr, min.freq = 1000, colors =
## brewer.pal(6, : comput could not be fit on page. It will not be plotted.
# Restore the plotting layout that was in effect before the clouds.
par(old_par)