library(SnowballC) # word stemming, used by stemDocument() below
## Option 1: retrieve tweets from Twitter
library(twitteR)
## Loading required package: ROAuth
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: digest
## Loading required package: rjson
# tweets <- userTimeline("RDataMining", n = 3200)
## Option 2: download @RDataMining tweets from RDataMining.com
setwd("~/work/datasciencecoursera/RDM/")
twitterF <- "./data/rdmTweets-201306.RData"
if (!file.exists(twitterF)) {
  url <- "http://www.rdatamining.com/data/rdmTweets-201306.RData"
  download.file(url, destfile = twitterF)
}
load(file = twitterF)
(n.tweet <- length(tweets))
## [1] 320
tweets[1:5]
## [[1]]
## [1] "RDataMining: Examples on calling Java code from R \nhttp://t.co/Yg1AivsO1R"
##
## [[2]]
## [1] "RDataMining: Simulating Map-Reduce in R for Big Data Analysis Using Flights Data http://t.co/uIAh6PgvQv via @rbloggers"
##
## [[3]]
## [1] "RDataMining: Job opportunity: Senior Analyst - Big Data at Wesfarmers Industrial & Safety - Sydney Area, Australia #jobs http://t.co/gXogcvR4XT"
##
## [[4]]
## [1] "RDataMining: CLAVIN: an open source software package for document geotagging and geoparsing http://t.co/gTGbTanKCI"
##
## [[5]]
## [1] "RDataMining: An online book on Natural Language Processing (with Python) http://t.co/5j31FhtrA6"
# convert tweets to a data frame
# tweets.df <- do.call("rbind", lapply(tweets, as.data.frame))
tweets.df <- twListToDF(tweets)
dim(tweets.df)
## [1] 320 14
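# peek at the two columns used later (text feeds the corpus below, and
# created feeds the topic time series at the end); column names are as
# produced by twListToDF()
head(tweets.df[, c("text", "created")])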
library(tm)
## Loading required package: NLP
# build a corpus, and specify the source to be character vectors
myCorpus <- Corpus(VectorSource(tweets.df$text))
# convert to lower case
# tm v0.5: myCorpus <- tm_map(myCorpus, tolower)
# tm v0.6 needs base functions wrapped in content_transformer()
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove URLs; the pattern works here because punctuation was removed
# first, leaving URLs as plain alphanumeric strings like "httptcoYg1AivsO1R"
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
# add two extra stop words: 'available' and 'via'
myStopwords <- c(stopwords("english"), "available", "via")
# remove 'r' and 'big' from stopwords
myStopwords <- setdiff(myStopwords, c("r", "big"))
# remove stopwords from corpus
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
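# optional extra step (not in the original): removing words leaves runs
# of whitespace behind, which stripWhitespace() collapses
myCorpus <- tm_map(myCorpus, stripWhitespace)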
# keep a copy of the corpus to use later as a dictionary for stem completion
myCorpusCopy <- myCorpus
# stem words
myCorpus <- tm_map(myCorpus, stemDocument)
# inspect the first 5 documents (tweets): inspect(myCorpus[1:5])
# The loop below prints them line by line so the text fits the paper width.
for (i in 1:5) {
  cat(paste("[[", i, "]] ", sep = ""))
  writeLines(as.character(myCorpus[[i]]))
}
## [[1]] exampl call java code r
##
## [[2]] simul mapreduc r big data analysi use flight data rblogger
##
## [[3]] job opportun senior analyst big data wesfarm industri amp safeti sydney area australia job
##
## [[4]] clavin open sourc softwar packag document geotag geopars
##
## [[5]] onlin book natur languag process python
# The tm v0.5 code below (stem completion and counting/replacing words)
# no longer runs as-is under tm v0.6, so it is kept commented out.
# myCorpus <- tm_map(myCorpus, stemCompletion, dictionary = myCorpusCopy)
## count frequency of "mining"
# miningCases <- tm_map(myCorpusCopy, grep, pattern = "\\<mining")
# sum(unlist(miningCases))
## count frequency of "miners"
# minerCases <- tm_map(myCorpusCopy, grep, pattern = "\\<miners")
# sum(unlist(minerCases))
## replace "miners" with "mining"
# myCorpus <- tm_map(myCorpus, gsub, pattern = "miners", replacement = "mining")
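# A minimal sketch of the same steps under tm v0.6, assuming corpus
# elements can be coerced with as.character(); wordFreq and replaceWord
# are helper names introduced here, not tm functions.
wordFreq <- function(corpus, word) {
  # count documents (tweets) whose text contains the word
  sum(unlist(lapply(corpus, function(d)
    grepl(paste0("\\<", word), as.character(d)))))
}
wordFreq(myCorpusCopy, "mining")
replaceWord <- function(corpus, old, new) {
  # pattern/replacement are passed through content_transformer() to gsub
  tm_map(corpus, content_transformer(gsub), pattern = old, replacement = new)
}
myCorpus <- replaceWord(myCorpus, "miners", "mining")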
# build a term-document matrix; wordLengths = c(1, Inf) keeps one-letter
# terms such as "r", which the default minimum length of 3 would drop
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
tdm
## <<TermDocumentMatrix (terms: 861, documents: 320)>>
## Non-/sparse entries: 2506/273014
## Sparsity : 99%
## Maximal term length: 25
## Weighting : term frequency (tf)
## Frequent Words and Associations
idx <- which(dimnames(tdm)$Terms == "r")
inspect(tdm[idx + (0:5), 101:110])
## <<TermDocumentMatrix (terms: 6, documents: 10)>>
## Non-/sparse entries: 4/56
## Sparsity : 93%
## Maximal term length: 12
## Weighting : term frequency (tf)
##
## Docs
## Terms 101 102 103 104 105 106 107 108 109 110
## r 0 1 1 0 0 0 0 0 1 1
## ramachandran 0 0 0 0 0 0 0 0 0 0
## random 0 0 0 0 0 0 0 0 0 0
## rank 0 0 0 0 0 0 0 0 0 0
## rann 0 0 0 0 0 0 0 0 0 0
## rapidmin 0 0 0 0 0 0 0 0 0 0
# inspect frequent words
(freq.terms <- findFreqTerms(tdm, lowfreq = 15))
## [1] "analysi" "applic" "big" "book" "code"
## [6] "comput" "data" "exampl" "group" "introduct"
## [11] "mine" "network" "packag" "posit" "r"
## [16] "research" "see" "slide" "social" "tutori"
## [21] "univers" "use"
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 5)
df <- data.frame(term = names(term.freq), freq = term.freq)
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:NLP':
##
## annotate
ggplot(df, aes(x = term, y = freq)) + geom_bar(stat = "identity") +
  xlab("Terms") + ylab("Count") + coord_flip()
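# an optional variant (not in the original): reorder() sorts the bars by
# frequency, which makes the chart easier to read
ggplot(df, aes(x = reorder(term, freq), y = freq)) +
  geom_bar(stat = "identity") + xlab("Terms") + ylab("Count") + coord_flip()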
# which words are associated with 'r'?
findAssocs(tdm, "r", 0.2)
## r
## exampl 0.33
## code 0.29
# which words are associated with 'mining'?
findAssocs(tdm, "mining", 0.25)
## mining
## comprehens 1.00
## build 0.58
## project 0.58
## join 0.41
## rdatamin 0.37
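# graph and Rgraphviz are Bioconductor (not CRAN) packages; if they are
# missing, they can be installed with the Bioconductor installer of this
# R generation, e.g.:
# source("http://bioconductor.org/biocLite.R")
# biocLite(c("graph", "Rgraphviz"))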
library(graph)
library(Rgraphviz)
## Loading required package: grid
##
## Attaching package: 'Rgraphviz'
##
## The following object is masked from 'package:twitteR':
##
## name
# network of frequent terms, linking pairs whose correlation exceeds the threshold
plot(tdm, term = freq.terms, corThreshold = 0.12, weighting = TRUE)
library(wordcloud)
## Loading required package: RColorBrewer
m <- as.matrix(tdm)
# calculate the frequency of words and sort it by frequency
word.freq <- sort(rowSums(m), decreasing = TRUE)
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3,
          random.order = FALSE)
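# an optional variant (not in the original): colour words by frequency
# with an RColorBrewer palette (RColorBrewer is loaded with wordcloud)
pal <- brewer.pal(9, "BuGn")[-(1:4)]
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3,
          random.order = FALSE, colors = pal)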
# remove sparse terms
tdm2 <- removeSparseTerms(tdm, sparse = 0.95)
m2 <- as.matrix(tdm2)
# cluster terms
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method = "ward.D")
plot(fit)
rect.hclust(fit, k = 6) # cut tree into 6 clusters
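# a small follow-up sketch: cutree() returns the cluster membership of
# every term from the same tree
groups <- cutree(fit, k = 6)
table(groups)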
m3 <- t(m2) # transpose the matrix to cluster documents (tweets)
set.seed(122) # set a fixed random seed
k <- 6 # number of clusters
kmeansResult <- kmeans(m3, k)
round(kmeansResult$centers, digits = 3) # cluster centers
## analysi big book comput data exampl mine network packag posit r
## 1 0.134 0.149 0.015 0.060 1.045 0.030 0.373 0.000 0.015 0.075 0.194
## 2 0.027 0.135 0.270 0.027 1.459 0.243 1.054 0.000 0.189 0.000 1.000
## 3 0.857 0.000 0.000 0.000 0.048 0.095 0.095 0.952 0.095 0.190 0.286
## 4 0.078 0.013 0.052 0.052 0.000 0.065 0.104 0.013 0.117 0.039 0.000
## 5 0.082 0.000 0.047 0.106 0.035 0.271 0.106 0.035 0.200 0.000 1.188
## 6 0.091 0.152 0.000 0.000 0.515 0.000 0.091 0.000 0.000 0.667 0.000
## research slide social tutori univers use
## 1 0.030 0.090 0.000 0.075 0.030 0.015
## 2 0.027 0.054 0.000 0.000 0.000 0.243
## 3 0.048 0.095 0.810 0.190 0.048 0.095
## 4 0.013 0.104 0.013 0.052 0.091 0.052
## 5 0.000 0.165 0.000 0.094 0.000 0.141
## 6 0.970 0.000 0.121 0.000 0.424 0.000
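# a short sketch to summarise the result: print the top 5 terms of every
# cluster, ranked by the values of the cluster centres
for (i in 1:k) {
  s <- sort(kmeansResult$centers[i, ], decreasing = TRUE)
  cat("cluster", i, ": ", paste(names(s)[1:5], collapse = " "), "\n")
}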
# fpc must be installed first, e.g. with install.packages("fpc")
library(fpc)
## Error: there is no package called 'fpc'
# partitioning around medoids, with estimation of the number of clusters
pamResult <- pamk(m3, metric = "manhattan")
## Error: could not find function "pamk"
k <- pamResult$nc # number of clusters identified
## Error: object 'pamResult' not found
pamResult <- pamResult$pamobject
## Error: object 'pamResult' not found
# print cluster medoids
for (i in 1:k) {
  cat("cluster", i, ": ",
      colnames(pamResult$medoids)[which(pamResult$medoids[i, ] == 1)], "\n")
}
## Error: object 'pamResult' not found
# plot clustering result
layout(matrix(c(1, 2), 1, 2)) # set to two graphs per page
plot(pamResult, col.p = pamResult$clustering)
## Error: error in evaluating the argument 'x' in selecting a method for function 'plot': Error: object 'pamResult' not found
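# when fpc is unavailable, a rough fallback (with a fixed k instead of
# pamk's estimated one) is pam() from the recommended cluster package:
library(cluster)
pamResult2 <- pam(m3, k = 6, metric = "manhattan")
table(pamResult2$clustering) # cluster sizes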
# topic modelling works on a document-term matrix, so convert the TDM
dtm <- as.DocumentTermMatrix(tdm)
library(topicmodels)
## Warning: package 'topicmodels' was built under R version 3.1.2
lda <- LDA(dtm, k = 8) # find 8 topics
term <- terms(lda, 4) # first 4 terms of every topic
term
## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7
## [1,] "data" "mine" "r" "r" "r" "data" "r"
## [2,] "research" "univers" "data" "exampl" "packag" "r" "data"
## [3,] "big" "associ" "use" "data" "group" "cluster" "time"
## [4,] "mine" "lectur" "mine" "mine" "map" "applic" "posit"
## Topic 8
## [1,] "analysi"
## [2,] "network"
## [3,] "social"
## [4,] "tutori"
term <- apply(term, MARGIN = 2, paste, collapse = ", ")
# first topic identified for every document (tweet)
require(data.table) # for IDate
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.1.2
topic <- topics(lda, 1)
topics <- data.frame(date=as.IDate(tweets.df$created), topic)
qplot(date, ..count.., data = topics, geom = "density",
      fill = term[topic], position = "stack")
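# the same stacked-density plot written with ggplot() instead of qplot(),
# as a sketch; the position argument moves into geom_density()
ggplot(topics, aes(x = date, fill = term[topic])) +
  geom_density(position = "stack")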