Load tweets.
# Download the sample tweets once and cache them next to the script;
# later runs reuse the cached file instead of hitting the network.
if (!file.exists("rdmTweets.RData")) {
  library(httr)
  resp <- GET("http://www.rdatamining.com/data/rdmTweets.RData")
  # Fail on an HTTP error instead of caching an error page as the .RData
  # file, which would make load() fail now and on every later run.
  stop_for_status(resp)
  writeBin(content(resp, "raw"), "rdmTweets.RData")
}
# load() restores the objects stored in the file (here: rdmTweets).
load("rdmTweets.RData")
# Show only the first two fields of each element to keep the output short.
str(rdmTweets, list.len = 2)
## List of 154
## $ :Reference class 'status' [package "twitteR"] with 10 fields
## ..$ text : chr "Postdoc/Research Scientist Position on Big Data at MIT http://t.co/hZ1ojAW2"
## ..$ favorited : logi TRUE
## .. [list output truncated]
## ..and 33 methods, of which 22 are possibly relevant:
## .. getCreated, getFavorited, getId, getReplyToSID, getReplyToSN,
## .. getReplyToUID, getScreenName, getStatusSource, getText,
## .. getTruncated, initialize, setCreated, setFavorited, setId,
## .. setReplyToSID, setReplyToSN, setReplyToUID, setScreenName,
## .. setStatusSource, setText, setTruncated, toDataFrame
## $ :Reference class 'status' [package "twitteR"] with 10 fields
## ..$ text : chr "Research scientist position for privacy-preserving data publishing, Singapore http://t.co/GPA0TyG5"
## ..$ favorited : logi TRUE
## .. [list output truncated]
## ..and 33 methods, of which 22 are possibly relevant:
## .. getCreated, getFavorited, getId, getReplyToSID, getReplyToSN,
## .. getReplyToUID, getScreenName, getStatusSource, getText,
## .. getTruncated, initialize, setCreated, setFavorited, setId,
## .. setReplyToSID, setReplyToSN, setReplyToUID, setScreenName,
## .. setStatusSource, setText, setTruncated, toDataFrame
## [list output truncated]
Transform the list of tweets into a data frame.
do.call passes a list of arguments to a function, e.g. do.call(func, list(1, 2, 3)) is equivalent to func(1, 2, 3).
# Convert every tweet object to a one-row data frame, then stack the rows.
df <- do.call(
  rbind,
  lapply(rdmTweets, function(tweet) tweet$toDataFrame())
)
# Rows x columns of the combined data frame.
dim(df)
## [1] 154 10
Transform the dataframe into corpus, so that the documents can be processed by functions provided by package tm.
library(tm)
## Loading required package: NLP
# Build a corpus with one document per tweet text.
myCorpus <- Corpus(VectorSource(df[["text"]]))
# Print the first five documents.
inspect(myCorpus[seq_len(5)])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] Postdoc/Research Scientist Position on Big Data at MIT http://t.co/hZ1ojAW2
## [2] Research scientist position for privacy-preserving data publishing, Singapore http://t.co/GPA0TyG5
## [3] Easier Parallel Computing in R with snowfall and sfCluster http://t.co/BPcinvzK
## [4] Tutorial: Parallel computing using R package snowfall http://t.co/CHBCyr76
## [5] handling big data: Interacting with Data using the filehash Package for R http://t.co/7RB3sChx
getTransformations lists all transformations provided by package tm.
# Print the names of the transformations that tm provides for use with tm_map().
getTransformations()
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
Remove URLs, convert the documents to lowercase, remove punctuation, and remove numbers.
# Apply PlainTextDocument to the corpus through tm_map().
# NOTE(review): presumably a compatibility step so the later transformations
# operate on plain text; confirm it is still needed with the installed tm version.
myCorpus <- tm_map(myCorpus, PlainTextDocument)
# Strip URLs from a character vector.
#
# x: character vector of texts.
# Returns x with every "http://..." or "https://..." run (letters, digits,
# ".", "/") deleted; surrounding whitespace is left untouched.
# The scheme is matched as "https?" so that secure links are removed as well;
# otherwise they would survive and turn into junk terms once punctuation is
# stripped.
removeUrl <- function(x) {
  gsub("https?://[[:alnum:]./]*", "", x)
}
# Run the cleaning steps in order: URLs must be removed before punctuation is
# stripped, otherwise the URL pattern no longer matches.
myCorpus <- Reduce(
  function(corpus, step) tm_map(corpus, step),
  list(
    content_transformer(removeUrl),
    content_transformer(tolower),
    removePunctuation,
    removeNumbers
  ),
  init = myCorpus
)
tm has predefined stop words, retrievable with the stopwords function. Remove these stop words, plus "via" and "available", but keep "r" and "big".
# English stop words plus two corpus-specific ones, minus "r" and "big",
# which are kept in the text.
myStopWords <- setdiff(
  c(stopwords(kind = "en"), "via", "available"),
  c("r", "big")
)
myCorpus <- tm_map(myCorpus, removeWords, myStopWords)
inspect(myCorpus[seq_len(5)])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] postdocresearch scientist position big data mit
## [2] research scientist position privacypreserving data publishing singapore
## [3] easier parallel computing r snowfall sfcluster
## [4] tutorial parallel computing using r package snowfall
## [5] handling big data interacting data using filehash package r
First stem the documents, then complete the stems using the unstemmed corpus as the dictionary.
# Keep a copy of the unstemmed corpus; it is the dictionary for stem completion.
myCorpus.dict <- myCorpus
# Reduce every word to its stem.
myCorpus <- tm_map(myCorpus, FUN = stemDocument)
inspect(myCorpus[seq_len(5)])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] postdocresearch scientist posit big data mit
## [2] research scientist posit privacypreserv data publish singapor
## [3] easier parallel comput r snowfal sfcluster
## [4] tutori parallel comput use r packag snowfal
## [5] handl big data interact data use filehash packag r
# Complete each stem back to a full word taken from the dictionary corpus.
# Each document is split on single spaces, every stem is completed, and the
# words are glued back together with single spaces.
myCorpus <- tm_map(
  myCorpus,
  function(x, dictionary, type = 'prevalent') {
    completeDoc <- function(doc) {
      stems <- unlist(strsplit(doc, split = " "))
      completed <- stemCompletion(stems, dictionary = dictionary, type = type)
      paste(completed, collapse = " ")
    }
    unlist(lapply(x, completeDoc))
  },
  dictionary = myCorpus.dict,
  type = 'prevalent'
)
inspect(myCorpus[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] postdocresearch scientist position big data mit
## [2] research scientist position privacypreserving data published singapore
## [3] easier parallel computer r snowfall sfcluster
## [4] tutorial parallel computer use r package snowfall
## [5] handling big data interacting data use filehash package r
The stem mine is completed as miners, not mining, so we manually replace miners with mining.
# Check how the stem "mine" is completed against the unstemmed corpus.
stemCompletion(c("mine"), dictionary = myCorpus.dict, type='prevalent')
## mine
## "miners"
# Undo the unwanted completion: every occurrence of "miners" becomes "mining".
myCorpus <- tm_map(myCorpus, function(x) gsub("miners", "mining", x))
inspect(myCorpus[seq_len(5)])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] postdocresearch scientist position big data mit
## [2] research scientist position privacypreserving data published singapore
## [3] easier parallel computer r snowfall sfcluster
## [4] tutorial parallel computer use r package snowfall
## [5] handling big data interacting data use filehash package r
Build a term-document matrix.
# Terms are rows, documents are columns. The minimum word length is set to 1
# so that one-letter terms such as "r" are kept.
myTdm <- TermDocumentMatrix(
  myCorpus,
  control = list(wordLengths = c(1, Inf))
)
# Print the summary of the matrix.
myTdm
## <<TermDocumentMatrix (terms: 498, documents: 154)>>
## Non-/sparse entries: 1168/75524
## Sparsity : 98%
## Maximal term length: 35
## Weighting : term frequency (tf)
# Look at the row for the term "r" and the five rows after it,
# restricted to the first ten documents.
inspect(myTdm[which(rownames(myTdm) == "r") + (0:5), seq_len(10)])
## <<TermDocumentMatrix (terms: 6, documents: 10)>>
## Non-/sparse entries: 18/42
## Sparsity : 70%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1 10 2 3 4 5 6 7 8 9
## package 0 0 0 0 1 1 0 0 0 0
## r 0 1 0 1 1 1 1 1 1 1
## sfcluster 0 0 0 1 0 0 0 0 0 0
## snowfall 0 0 0 1 1 0 1 0 0 0
## tutorial 0 0 0 0 1 0 0 0 0 0
## use 0 0 0 0 1 1 1 0 0 0
Find frequent terms. Use barplot or ggplot2::qplot to plot the barchart.
las=2 makes the tick labels perpendicular to the axis. horiz=T swaps x- and y-axis.
# Total number of occurrences of each term across all documents.
freqTerms <- rowSums(as.matrix(myTdm))
# Keep terms occurring at least 10 times, in ascending order of frequency.
freqTerms <- sort(freqTerms[freqTerms >= 10])
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
barplot(freqTerms, horiz = TRUE, las = 2)
ggplot2::qplot orders the x axis by default. We can force its order by converting the variable to a factor and specifying the order of its levels.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
# Bar chart of the frequent terms. qplot() is deprecated in ggplot2 3.4.0, so
# the plot is built with ggplot() directly. The factor levels pin the bar
# order to the order of freqTerms, and weight makes each bar's height equal
# to the term's frequency.
ggplot(
  data.frame(
    term = factor(names(freqTerms), levels = names(freqTerms)),
    freq = as.numeric(freqTerms)
  ),
  aes(x = term, weight = freq)
) +
  geom_bar() +
  xlab("Terms") +
  coord_flip()
Find word associations.
# List the terms associated with "mining", using 0.25 as the correlation limit.
findAssocs(myTdm, 'mining', corlimit=0.25)
## $mining
## data mahout recommendation sets supports
## 0.55 0.39 0.39 0.39 0.39
## frequent itemset card function reference
## 0.35 0.34 0.29 0.29 0.29
## text
## 0.26
Word cloud
# Frequencies of all terms (no threshold this time), in ascending order.
freqTerms <- sort(rowSums(as.matrix(myTdm)))
# Fix the RNG seed; presumably so the word cloud drawn afterwards is reproducible.
set.seed(375)
# Map frequency to a gray shade: the more frequent the term, the closer the
# level is to 0, i.e. the darker the colour.
graylevel <- gray((1 - (freqTerms + 40) / (max(freqTerms) + 40))^2)
library(wordcloud)
## Loading required package: RColorBrewer
# Draw the word cloud for terms that occur at least 3 times.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
wordcloud(
  words = names(freqTerms),
  freq = freqTerms,
  min.freq = 3,
  random.order = FALSE,
  colors = graylevel
)