#install.packages("twitteR")
library(twitteR)
#Example from http://www.rdatamining.com/examples/text-mining
#https://dev.twitter.com/
#http://geoffjentry.hexdump.org/twitteR.pdf
#https://twitter.com/apps/new
#>setup_twitter_oauth("APIkey","APIsecret","Accesstoken","Accesssecret “)
setup_twitter_oauth("PETxc9u6KqENPByL805wlFO1M","ptysLxcJo6ZSBdl6CIM1Nhdv7w16oSnsRe0PI15jwvieyG9biw","3232641518-5AERq2G5xQHWKVsQWy0pKNZYCixFQH6d77CSRng","Tkt4UHBMXwZqtlTHCx8gw36h2VKU1RGAZrhiA4uCmWxgA")
## [1] "Using direct authentication"
rdmTweets <- userTimeline("cia", n=500)  # pull up to 500 recent tweets from the @CIA timeline
rdmTweets[1:3]
## [[1]]
## [1] "CIA: CIA #Museum Artifact of the Week: Afghan Hat\nA gift from Afghan President Karzai to former DCI George TenetÂ… https://t.co/zUAZ9OqMQB"
##
## [[2]]
## [1] "CIA: ICYMI:\nNew Anthology: \nCIA & the Wars in Southeast Asia, 1947-75\n\n41 #unclassified articles & more!Â… https://t.co/ukmH3tTIoY"
##
## [[3]]
## [1] "CIA: ICYMI:\nNew #Unclassified \"Studies in Intel\":\n-Intel for Warfighter\n-Why Bad Things Happen to Good Analysts\n-& more!Â… https://t.co/pNPryg92u1"
df <- do.call("rbind", lapply(rdmTweets, as.data.frame))  # flatten the list of status objects into a data frame
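# twitteR also provides twListToDF(), which performs the same conversion in one call;
# an equivalent alternative:
#df <- twListToDF(rdmTweets)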
library(tm)
## Loading required package: NLP
Corpus1 <- Corpus(VectorSource(df$text))  # build a corpus from the tweet text
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))  # wrap base functions in content_transformer() for tm >= 0.6
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
Corpus1 <- tm_map(Corpus1, stemDocument)  # stemming requires the SnowballC package to be installed
Corpus1 <- tm_map(Corpus1, stripWhitespace)
#Corpus1 <- tm_map(Corpus1, PlainTextDocument)  # old workaround for tolower(); not needed with content_transformer()
dtm <- DocumentTermMatrix(Corpus1)  # tweets as rows, terms as columns
tdm <- TermDocumentMatrix(Corpus1)  # terms as rows, tweets as columns
matx1 <- as.matrix(tdm)
sort1 <- sort(rowSums(matx1), decreasing=TRUE)  # total frequency of each term across all tweets
di <- data.frame(Word=names(sort1), Frequency=sort1)
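# Quick sanity check of the most frequent terms before plotting (uses the di data frame built above):
head(di, 10)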
#install.packages("devtools")
library(devtools)
#install_github("okugami79/sentiment140")
library(sentiment)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: rjson
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following object is masked from 'package:twitteR':
##
## id
a <- sentiment(di$Word)  # classify the polarity of each word
table(a$polarity)
##
## negative neutral
## 2 1005
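# Which words were scored negative? A small check, assuming sentiment() returns one row
# per input word in input order:
di$Polarity <- a$polarity
subset(di, Polarity == "negative")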
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(di$Word, di$Frequency, max.words=100, colors=brewer.pal(6, "Set1"))  # cloud of the 100 most frequent terms
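# A plain bar chart of the top terms is sometimes easier to read than a word cloud:
barplot(sort1[1:10], las=2, main="Top 10 terms")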

findFreqTerms(dtm, lowfreq=10)
## [1] "amp" "artifact" "cia" "icymi" "inmemoriam"
## [6] "intel" "intelcon" "museum" "oss" "pdb"
## [11] "week"
findAssocs(dtm, 'cia', 0.30)
## $cia
## artifact museum week
## 0.46 0.46 0.41
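# The same call works for any other frequent term found above, e.g.:
findAssocs(dtm, 'museum', 0.30)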
#dtms <- removeSparseTerms(dtm, 0.15) # optionally drop sparse terms (keep only terms with at most 15% zero entries, i.e. appearing in at least 85% of tweets)
library(cluster)
d <- dist(t(as.matrix(dtm)), method="euclidean") # distance between terms (terms become rows after transposing)
fit <- hclust(d=d, method="ward.D") # "ward" was renamed to "ward.D" in recent R; "ward.D2" is also available
plot.new()
plot(fit, hang=-1)
groups <- cutree(fit, k=5) # "k=" defines the number of clusters you are using
rect.hclust(fit, k=5, border="red") # highlight the 5 clusters on the dendrogram with red borders
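# Inspect which terms fell into each of the 5 clusters:
table(groups)
split(names(groups), groups)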
### K-means clustering
library(fpc)

library(cluster)
d <- dist(t(as.matrix(dtm)), method="euclidean")
kfit <- kmeans(d, 2) # k-means with 2 clusters on the distance matrix
clusplot(as.matrix(d), kfit$cluster, color=TRUE, shade=TRUE, labels=2, lines=0)
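# A rough heuristic for choosing k (not part of the original analysis): compare the total
# within-cluster sum of squares across a range of k values and look for an "elbow".
set.seed(123)  # arbitrary seed, only for reproducibility of the sketch
wss <- sapply(2:10, function(k) kmeans(d, k)$tot.withinss)
plot(2:10, wss, type="b", xlab="Number of clusters k", ylab="Total within-cluster SS")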
