Text mining

Extracting Twitter data

Load tweets.

# Download the saved tweets once and cache them in the working directory.
if (!file.exists("rdmTweets.RData")) {
  library(httr)
  resp <- GET("http://www.rdatamining.com/data/rdmTweets.RData")
  # Abort on an HTTP error; otherwise the error page would be written to
  # "rdmTweets.RData" and, because the file then exists, never re-downloaded.
  stop_for_status(resp)
  writeBin(content(resp, "raw"), "rdmTweets.RData")
}
# load() restores the objects saved in the file; rdmTweets is used right below.
load("rdmTweets.RData")
str(rdmTweets, list.len = 2)
## List of 154
##  $ :Reference class 'status' [package "twitteR"] with 10 fields
##   ..$ text        : chr "Postdoc/Research Scientist Position on Big Data at MIT http://t.co/hZ1ojAW2"
##   ..$ favorited   : logi TRUE
##   .. [list output truncated]
##   ..and 33 methods, of which 22 are  possibly relevant:
##   ..  getCreated, getFavorited, getId, getReplyToSID, getReplyToSN,
##   ..  getReplyToUID, getScreenName, getStatusSource, getText,
##   ..  getTruncated, initialize, setCreated, setFavorited, setId,
##   ..  setReplyToSID, setReplyToSN, setReplyToUID, setScreenName,
##   ..  setStatusSource, setText, setTruncated, toDataFrame
##  $ :Reference class 'status' [package "twitteR"] with 10 fields
##   ..$ text        : chr "Research scientist position for privacy-preserving data publishing, Singapore http://t.co/GPA0TyG5"
##   ..$ favorited   : logi TRUE
##   .. [list output truncated]
##   ..and 33 methods, of which 22 are  possibly relevant:
##   ..  getCreated, getFavorited, getId, getReplyToSID, getReplyToSN,
##   ..  getReplyToUID, getScreenName, getStatusSource, getText,
##   ..  getTruncated, initialize, setCreated, setFavorited, setId,
##   ..  setReplyToSID, setReplyToSN, setReplyToUID, setScreenName,
##   ..  setStatusSource, setText, setTruncated, toDataFrame
##   [list output truncated]

Transform the list of tweets into dataframe.

do.call passes a list of arguments to a function. E.g., do.call(func, list(1,2,3)) is equivalent to func(1,2,3).

# Convert every tweet with its toDataFrame() method, then stack the results
# row-wise into a single data frame.
df <- do.call(
  rbind,
  lapply(rdmTweets, function(tweet) tweet$toDataFrame())
)
dim(df)
## [1] 154  10

Transform the dataframe into corpus, so that the documents can be processed by functions provided by package tm.

library(tm)
## Loading required package: NLP
# Build the corpus from the tweet texts, one document per tweet.
myCorpus <- Corpus(VectorSource(df[["text"]]))
inspect(myCorpus[seq_len(5)])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] Postdoc/Research Scientist Position on Big Data at MIT http://t.co/hZ1ojAW2                       
## [2] Research scientist position for privacy-preserving data publishing, Singapore http://t.co/GPA0TyG5
## [3] Easier Parallel Computing in R with snowfall and sfCluster http://t.co/BPcinvzK                   
## [4] Tutorial: Parallel computing using R package snowfall http://t.co/CHBCyr76                        
## [5] handling big data: Interacting with Data using the filehash Package for R http://t.co/7RB3sChx

getTransformations lists all transformations provided by package tm.

# Print the names of the predefined transformations that can be given to tm_map().
getTransformations()
## [1] "removeNumbers"     "removePunctuation" "removeWords"      
## [4] "stemDocument"      "stripWhitespace"

Remove URLs, convert the documents to lowercase, then remove punctuation and numbers.

# NOTE(review): the corpus is still printed as a SimpleCorpus after the
# cleaning steps, so the effect of this conversion is unclear — confirm that
# it is still needed with the installed tm version.
myCorpus <- tm_map(myCorpus, PlainTextDocument)
# Strip web links from text.
#   x: character vector.
# Returns x with every http:// or https:// link removed. A link extends to
# the next whitespace character, so query strings, dashes and underscores
# are removed together with the rest of the link.
removeUrl <- function(x) {
  gsub("https?://[^[:space:]]*", "", x)
}
# URLs must go first: once punctuation is stripped, the "://" that removeUrl
# looks for is gone and the link text would stay in the documents.
myCorpus <- tm_map(myCorpus, content_transformer(removeUrl))
# removeUrl and tolower are not among tm's own transformations (see the
# getTransformations() output), hence the content_transformer() wrapper.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)

tm ships predefined stop words, retrievable with the stopwords function. Remove them, together with "via" and "available", but keep "r" and "big".

# Start from tm's English stop words, add two corpus-specific ones, and make
# sure "r" and "big" are never treated as stop words.
myStopWords <- setdiff(
  c(stopwords(kind = "en"), "via", "available"),
  c("r", "big")
)
myCorpus <- tm_map(myCorpus, removeWords, myStopWords)

inspect(myCorpus[seq_len(5)])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] postdocresearch scientist position  big data  mit                        
## [2] research scientist position  privacypreserving data publishing singapore 
## [3] easier parallel computing  r  snowfall  sfcluster                        
## [4] tutorial parallel computing using r package snowfall                     
## [5] handling big data interacting  data using  filehash package  r

First stem the documents, then complete the stems using the unstemmed corpus as the dictionary.

# Keep a copy of the unstemmed corpus; it is the dictionary for the stem
# completion that follows.
myCorpus.dict <- myCorpus
myCorpus <- tm_map(myCorpus, stemDocument)
inspect(myCorpus[seq_len(5)])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] postdocresearch scientist posit big data mit                 
## [2] research scientist posit privacypreserv data publish singapor
## [3] easier parallel comput r snowfal sfcluster                   
## [4] tutori parallel comput use r packag snowfal                  
## [5] handl big data interact data use filehash packag r
# Complete every stem back to a full word, using the unstemmed corpus as the
# dictionary. The anonymous function receives dictionary= and type= through
# tm_map; for each element xx of x it splits the text on single spaces, runs
# stemCompletion() on the tokens and joins the results with single spaces.
# NOTE(review): unlike removeUrl/tolower above, this function is passed without
# content_transformer() — presumably it relies on how tm_map treats this
# corpus class; confirm before reusing it with another corpus type.
myCorpus <- tm_map(myCorpus, function(x, dictionary,type='prevalent')
  unlist(lapply(x, function(xx){paste(stemCompletion(unlist(strsplit(xx, split=" ")), dictionary = dictionary, type=type), collapse=" ")
  })), dictionary=myCorpus.dict,type='prevalent')

inspect(myCorpus[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] postdocresearch scientist position big data mit                       
## [2] research scientist position privacypreserving data published singapore
## [3] easier parallel computer r snowfall sfcluster                         
## [4] tutorial parallel computer use r package snowfall                     
## [5] handling big data interacting data use filehash package r

The stem mine is completed as miners, not mining, so we replace it manually.

# Show how the stem "mine" is completed with the current dictionary.
stemCompletion(c("mine"), dictionary = myCorpus.dict, type='prevalent')
##     mine 
## "miners"
# Wrap gsub in content_transformer(), as done for removeUrl and tolower, so the
# replacement edits the document content and does not depend on the corpus
# class. pattern/replacement are forwarded to gsub by tm_map.
myCorpus <- tm_map(myCorpus, content_transformer(gsub),
                   pattern = "miners", replacement = "mining")
inspect(myCorpus[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] postdocresearch scientist position big data mit                       
## [2] research scientist position privacypreserving data published singapore
## [3] easier parallel computer r snowfall sfcluster                         
## [4] tutorial parallel computer use r package snowfall                     
## [5] handling big data interacting data use filehash package r

Build a term-document matrix.

# Lower the minimum word length to 1 so that one-letter terms such as "r"
# stay in the matrix.
myTdm <- TermDocumentMatrix(
  myCorpus,
  control = list(wordLengths = c(1, Inf))
)
myTdm
## <<TermDocumentMatrix (terms: 498, documents: 154)>>
## Non-/sparse entries: 1168/75524
## Sparsity           : 98%
## Maximal term length: 35
## Weighting          : term frequency (tf)
# Look at the six consecutive term rows starting at "r", restricted to the
# first ten documents.
inspect(myTdm[which(rownames(myTdm) == "r") + (0:5), seq_len(10)])
## <<TermDocumentMatrix (terms: 6, documents: 10)>>
## Non-/sparse entries: 18/42
## Sparsity           : 70%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## Sample             :
##            Docs
## Terms       1 10 2 3 4 5 6 7 8 9
##   package   0  0 0 0 1 1 0 0 0 0
##   r         0  1 0 1 1 1 1 1 1 1
##   sfcluster 0  0 0 1 0 0 0 0 0 0
##   snowfall  0  0 0 1 1 0 1 0 0 0
##   tutorial  0  0 0 0 1 0 0 0 0 0
##   use       0  0 0 0 1 1 1 0 0 0

Find frequent terms. Use barplot or ggplot2::qplot to plot the barchart.

las=2 makes the tick labels perpendicular to the axis. horiz=T swaps x- and y-axis.

# Total number of occurrences of every term across all documents.
freqTerms <- rowSums(as.matrix(myTdm))

# Keep the terms that occur at least 10 times, in ascending order of count.
freqTerms <- freqTerms[freqTerms >= 10]
freqTerms <- sort(freqTerms)
# Spell out TRUE: T is an ordinary variable and can be reassigned.
barplot(freqTerms, horiz = TRUE, las = 2)

ggplot2::qplot will order the x axis. We can force its order by converting the dimension to a factor and specifying its level order.

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
# The factor levels follow the order of freqTerms, which pins the bar order;
# coord_flip() then turns the chart into horizontal bars.
# NOTE: qplot() is deprecated as of ggplot2 3.4.0.
qplot(
  x = factor(names(freqTerms), levels = names(freqTerms)),
  weight = freqTerms,
  geom = "bar",
  xlab = "Terms"
) +
  coord_flip()

Find word associations.

# List the terms associated with "mining"; corlimit = 0.25 is the lower bound
# on the correlation for a term to be reported.
findAssocs(myTdm, 'mining', corlimit=0.25)
## $mining
##           data         mahout recommendation           sets       supports 
##           0.55           0.39           0.39           0.39           0.39 
##       frequent        itemset           card       function      reference 
##           0.35           0.34           0.29           0.29           0.29 
##           text 
##           0.26

Word cloud

# Recompute the counts over all terms; freqTerms was cut down to the frequent
# terms for the bar charts above.
freqTerms <- rowSums(as.matrix(myTdm))
freqTerms <- sort(freqTerms)
# Fixed seed, presumably so that the word cloud layout is reproducible.
set.seed(375)
# Gray level per term: 0 (black) for the most frequent term, lighter shades
# for rarer ones. The +40 offset keeps rare terms away from pure white.
graylevel <- gray((1 - (freqTerms + 40) / (max(freqTerms) + 40))^2)

library(wordcloud)
## Loading required package: RColorBrewer
# NOTE(review): graylevel holds one shade per term, including terms below
# min.freq — confirm whether ordered.colors = TRUE is wanted so that each word
# is drawn in its own shade.
wordcloud(
  words = names(freqTerms),
  freq = freqTerms,
  min.freq = 3,
  random.order = FALSE,
  colors = graylevel
)