Text Mining Example Code (Tweets)

library(SnowballC) # for the stemming done later via stemDocument()

## Option 1: retrieve tweets from Twitter
library(twitteR)
## Loading required package: ROAuth
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: digest
## Loading required package: rjson
# tweets <- userTimeline("RDataMining", n = 3200)
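# Option 1 only works after authenticating with the Twitter API. A minimal
# sketch, assuming app credentials have already been created (all four
# arguments below are placeholder names):
# setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)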

## Option 2: download @RDataMining tweets from RDataMining.com
setwd("~/work/datasciencecoursera/RDM/")
twitterF <- "./data/rdmTweets-201306.RData"
if (!file.exists(twitterF)) {
    url <- "http://www.rdatamining.com/data/rdmTweets-201306.RData"
    download.file(url, destfile = twitterF)
}

Load Tweets into R

load(file = twitterF)
(n.tweet <- length(tweets))
## [1] 320
tweets[1:5]
## [[1]]
## [1] "RDataMining: Examples on calling Java code from R \nhttp://t.co/Yg1AivsO1R"
## 
## [[2]]
## [1] "RDataMining: Simulating Map-Reduce in R for Big Data Analysis Using Flights Data http://t.co/uIAh6PgvQv via @rbloggers"
## 
## [[3]]
## [1] "RDataMining: Job opportunity: Senior Analyst - Big Data at Wesfarmers Industrial &amp; Safety - Sydney Area, Australia #jobs http://t.co/gXogcvR4XT"
## 
## [[4]]
## [1] "RDataMining: CLAVIN: an open source software package for document geotagging and geoparsing http://t.co/gTGbTanKCI"
## 
## [[5]]
## [1] "RDataMining: An online book on Natural Language Processing (with Python) http://t.co/5j31FhtrA6"

Text Cleaning

# convert tweets to a data frame
# tweets.df <- do.call("rbind", lapply(tweets, as.data.frame))
tweets.df <- twListToDF(tweets)
dim(tweets.df)
## [1] 320  14
library(tm)
## Loading required package: NLP
# build a corpus, and specify the source to be character vectors
myCorpus <- Corpus(VectorSource(tweets.df$text))
# convert to lower case; in tm v0.6, base functions must be wrapped in
# content_transformer() (previously: myCorpus <- tm_map(myCorpus, tolower))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation) 
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove URLs
# punctuation was already removed above, so this pattern also catches what
# remains of URLs, e.g. "httptcoYg1AivsO1R"
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
# add two extra stop words: 'available' and 'via'
myStopwords <- c(stopwords("english"), "available", "via")
# remove 'r' and 'big' from stopwords
myStopwords <- setdiff(myStopwords, c("r", "big"))
# remove stopwords from corpus
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#
## keep a copy of corpus to use later as a dictionary for stem
# completion
myCorpusCopy <- myCorpus
# stem words
myCorpus <- tm_map(myCorpus, stemDocument)
# inspect the first 5 documents (tweets): inspect(myCorpus[1:5])
# the loop below prints them manually so the text fits the page width
for (i in 1:5) {
    cat(paste("[[", i, "]] ", sep = ""))
    #writeLines(myCorpus[[i]])
    writeLines(as.character(myCorpus[[i]]))
}
## [[1]] exampl  call java code  r 
## 
## [[2]] simul mapreduc  r  big data analysi use flight data   rblogger
## [[3]] job opportun senior analyst  big data  wesfarm industri amp safeti  sydney area australia job
## [[4]] clavin  open sourc softwar packag  document geotag  geopars
## [[5]]  onlin book  natur languag process  python

Stem Completion

# stemCompletion() expects individual words rather than whole documents,
# so this one-liner does not work as intended in tm v0.6:
# myCorpus <- tm_map(myCorpus, content_transformer(stemCompletion), dictionary = myCorpusCopy, lazy=TRUE)
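A workaround is to complete the stems word by word against the saved copy and rebuild the corpus. A sketch, left commented out so the stemmed output below is unaffected; stemCompletion2 is a helper name introduced here:

stemCompletion2 <- function(x, dictionary) {
    x <- unlist(strsplit(as.character(x), " "))
    x <- x[x != ""] # drop empty strings left over from cleaning
    x <- stemCompletion(x, dictionary = dictionary)
    paste(x, sep = "", collapse = " ") # reassemble the document
}
# myCorpus <- Corpus(VectorSource(
#     sapply(myCorpus, stemCompletion2, dictionary = myCorpusCopy)))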

Count Frequency of “mining”

# miningCases <- tm_map(myCorpusCopy, grep, pattern = "\\<mining")
# sum(unlist(miningCases))

## count frequency of "miners"
# minerCases <- tm_map(myCorpusCopy, grep, pattern = "\\<miners")
# sum(unlist(minerCases))
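In tm v0.6 the grep-based tm_map calls above no longer work, because documents are not plain character vectors. A sketch of an alternative; wordFreq is a helper name introduced here, and it counts the tweets containing a word rather than total occurrences:

wordFreq <- function(corpus, word) {
    sum(sapply(corpus, function(doc) grepl(paste0("\\<", word), as.character(doc))))
}
# wordFreq(myCorpusCopy, "mining")
# wordFreq(myCorpusCopy, "miners")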


## replace "miners" with "mining"
# myCorpus <- tm_map(myCorpus, gsub, pattern = "miners", replacement = "mining")
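# in tm v0.6 the same replacement needs content_transformer(); a sketch,
# left commented out so the term counts below are unaffected:
# myCorpus <- tm_map(myCorpus, content_transformer(gsub),
#                    pattern = "miners", replacement = "mining")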
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
tdm
## <<TermDocumentMatrix (terms: 861, documents: 320)>>
## Non-/sparse entries: 2506/273014
## Sparsity           : 99%
## Maximal term length: 25
## Weighting          : term frequency (tf)
# frequent terms and associations
idx <- which(dimnames(tdm)$Terms == "r")
inspect(tdm[idx + (0:5), 101:110])
## <<TermDocumentMatrix (terms: 6, documents: 10)>>
## Non-/sparse entries: 4/56
## Sparsity           : 93%
## Maximal term length: 12
## Weighting          : term frequency (tf)
## 
##               Docs
## Terms          101 102 103 104 105 106 107 108 109 110
##   r              0   1   1   0   0   0   0   0   1   1
##   ramachandran   0   0   0   0   0   0   0   0   0   0
##   random         0   0   0   0   0   0   0   0   0   0
##   rank           0   0   0   0   0   0   0   0   0   0
##   rann           0   0   0   0   0   0   0   0   0   0
##   rapidmin       0   0   0   0   0   0   0   0   0   0
# inspect frequent words
(freq.terms <- findFreqTerms(tdm, lowfreq=15))
##  [1] "analysi"   "applic"    "big"       "book"      "code"     
##  [6] "comput"    "data"      "exampl"    "group"     "introduct"
## [11] "mine"      "network"   "packag"    "posit"     "r"        
## [16] "research"  "see"       "slide"     "social"    "tutori"   
## [21] "univers"   "use"
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >=5)
df <- data.frame(term = names(term.freq), freq = term.freq)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
ggplot(df, aes(x=term, y=freq)) + geom_bar(stat = "identity") + xlab("Terms") + ylab("Count") +coord_flip()

[plot of chunk wordPlot: horizontal bar chart of term frequencies]

# which words are associated with 'r'?
findAssocs(tdm, "r", 0.2)
##           r
## exampl 0.33
## code   0.29
# which words are associated with 'mining'?
findAssocs(tdm, "mining", 0.25)
##            mining
## comprehens   1.00
## build        0.58
## project      0.58
## join         0.41
## rdatamin     0.37
library(graph)
library(Rgraphviz)
## Loading required package: grid
## 
## Attaching package: 'Rgraphviz'
## 
## The following object is masked from 'package:twitteR':
## 
##     name
plot(tdm, term = freq.terms, corThreshold = 0.12, weighting = T)

[plot of chunk plot1: network of frequent terms, edges weighted by correlation]

Word Cloud

library(wordcloud)
## Loading required package: RColorBrewer
m <- as.matrix(tdm)
# calculate the frequency of words and sort it by frequency
word.freq <- sort(rowSums(m), decreasing = T)
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3,
          random.order = F)

[plot of chunk wordCloud1: word cloud of tweet terms]
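The cloud is easier to read with a colour palette from RColorBrewer (already loaded as a dependency of wordcloud). A sketch:

pal <- brewer.pal(8, "Dark2")
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3,
          random.order = F, colors = pal)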

Clustering

# remove sparse terms
tdm2 <- removeSparseTerms(tdm, sparse = 0.95)
m2 <- as.matrix(tdm2)
# cluster terms
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method = "ward.D")

plot(fit)
rect.hclust(fit, k = 6) # cut tree into 6 clusters 

[plot of chunk wordCluster: dendrogram of terms cut into 6 clusters]

m3 <- t(m2) # transpose the matrix to cluster documents (tweets)
set.seed(122) # set a fixed random seed
k <- 6 # number of clusters
kmeansResult <- kmeans(m3, k)
round(kmeansResult$centers, digits = 3) # cluster centers
##   analysi   big  book comput  data exampl  mine network packag posit     r
## 1   0.134 0.149 0.015  0.060 1.045  0.030 0.373   0.000  0.015 0.075 0.194
## 2   0.027 0.135 0.270  0.027 1.459  0.243 1.054   0.000  0.189 0.000 1.000
## 3   0.857 0.000 0.000  0.000 0.048  0.095 0.095   0.952  0.095 0.190 0.286
## 4   0.078 0.013 0.052  0.052 0.000  0.065 0.104   0.013  0.117 0.039 0.000
## 5   0.082 0.000 0.047  0.106 0.035  0.271 0.106   0.035  0.200 0.000 1.188
## 6   0.091 0.152 0.000  0.000 0.515  0.000 0.091   0.000  0.000 0.667 0.000
##   research slide social tutori univers   use
## 1    0.030 0.090  0.000  0.075   0.030 0.015
## 2    0.027 0.054  0.000  0.000   0.000 0.243
## 3    0.048 0.095  0.810  0.190   0.048 0.095
## 4    0.013 0.104  0.013  0.052   0.091 0.052
## 5    0.000 0.165  0.000  0.094   0.000 0.141
## 6    0.970 0.000  0.121  0.000   0.424 0.000
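The centers are easier to interpret as a short list of top terms per cluster. A sketch using the objects defined above:

for (i in 1:k) {
    s <- sort(kmeansResult$centers[i, ], decreasing = TRUE)
    cat("cluster", i, ": ", names(s)[1:5], "\n")
}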
# install.packages("fpc") # install first if missing
library(fpc)
# partitioning around medoids with estimation of number of clusters
pamResult <- pamk(m3, metric = "manhattan")
k <- pamResult$nc # number of clusters identified
pamResult <- pamResult$pamobject
# print cluster medoids
for (i in 1:k) {
    cat("cluster", i, ": ",
        colnames(pamResult$medoids)[which(pamResult$medoids[i, ] == 1)], "\n")
}
# plot clustering result
layout(matrix(c(1, 2), 1, 2)) # set to two graphs per page
plot(pamResult, col.p = pamResult$clustering)
layout(matrix(1)) # restore the default one-graph layout

Topic Model

dtm <- as.DocumentTermMatrix(tdm)
library(topicmodels)
lda <- LDA(dtm, k = 8) # find 8 topics
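# note: LDA fitting is stochastic; for reproducible topics one could pass a
# seed, e.g. LDA(dtm, k = 8, control = list(seed = 123)) (a sketch, not run here)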
term <- terms(lda, 4) # first 4 terms of every topic
term
##      Topic 1    Topic 2   Topic 3 Topic 4  Topic 5  Topic 6   Topic 7
## [1,] "data"     "mine"    "r"     "r"      "r"      "data"    "r"    
## [2,] "research" "univers" "data"  "exampl" "packag" "r"       "data" 
## [3,] "big"      "associ"  "use"   "data"   "group"  "cluster" "time" 
## [4,] "mine"     "lectur"  "mine"  "mine"   "map"    "applic"  "posit"
##      Topic 8  
## [1,] "analysi"
## [2,] "network"
## [3,] "social" 
## [4,] "tutori"
term <- apply(term, MARGIN = 2, paste, collapse = ", ")

# first topic identified for every document (tweet)
require(data.table) # for as.IDate()
## Loading required package: data.table
topic <- topics(lda, 1)
topics <- data.frame(date=as.IDate(tweets.df$created), topic)
qplot(date, ..count.., data=topics, geom="density",
      fill=term[topic], position="stack")

[plot of chunk topicModel1: stacked density of topics over tweet dates]