The field of data mining has evolved from the disciplines of statistics and artificial intelligence.
Data mining is a rapidly growing field that is concerned with developing techniques to assist managers to make intelligent use of these repositories.
Online social media represent a fundamental shift of how information is being produced, transferred and consumed. User generated content in the form of blog posts, comments, and tweets establishes a connection between the producers and the consumers of information.
Tracking the pulse of the social media outlets, enables companies to gain feedback and insight in how to improve and market products better. For consumers, the abundance of information and opinions from diverse sources helps them tap into the wisdom of crowds, to aid in making more informed decisions.
library(twitteR)
## Warning: package 'twitteR' was built under R version 3.3.3
# you need to use your own key, which can be obtain from tweeter
# api_key <- "xxxxxxxxxxxxxxxxxxxxxxxxx"
# api_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
## [1] "Using direct authentication"
wsj_tweet <- userTimeline("wsj", n=5000)
## Warning in statusBase(cmd, params, n, 3200, ...): statuses/user_timeline
## has a cap of 3200 statuses, clipping
n.tweets = length(wsj_tweet)
wsj.df <- twListToDF(wsj_tweet)
for (i in c(1:2, 5)) {
cat(paste0("[", i, "] "))
writeLines(strwrap(wsj.df$text[i], 60))
}
## [1] Spending on microtransactions across all game platforms
## reached $71 billion world-wide last year
## https://t.co/TWBKo7e85O
## [2] Beijing airs allegations involving whistleblowing
## businessman living in New York https://t.co/EZsLx6NNKj
## [5] Newly public companies, eager to take advantage of rising
## U.S. stock prices, are quickly returning to the market
## https://t.co/QlBnFoTbRM
library(tm)
## Loading required package: NLP
myCorpus <- Corpus(VectorSource(wsj.df$text))
myCorpus <- tm_map(myCorpus,content_transformer(tolower))
removeURL <- function(x) gsub("http[^[:space:]]*","",x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*","",x)
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
exceptions <- c("no","yes")
myStopwords <- setdiff(stopwords("en"), exceptions)
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus <- tm_map(myCorpus, stripWhitespace)
myCorpusCopy <- myCorpus
# Stemming and Stem Completion
myCorpus <- tm_map(myCorpus,stemDocument)
## inspect first 5 tweets
for (i in c(1:2,5)) {
cat(paste0("[",i,"]"))
writeLines(strwrap(as.character(myCorpus[[i]]),60))
}
## [1]spend microtransact across game platform reach billion
## worldwid last year
## [2]beij air alleg involv whistleblow businessman live new york
## [5]newli public compani eager take advantag rise us stock
## price quick return market
stemCompletion2 <- function(x,dictionary) {
x <- unlist(strsplit(as.character(x)," "))
x <- x[x != ""]
x <- stemCompletion(x, dictionary = dictionary)
x <- paste(x, sep="", collapse=" ")
PlainTextDocument(stripWhitespace(x))
}
myCorpus <- lapply(myCorpus, stemCompletion2, dictionary=myCorpusCopy)
myCorpus <- Corpus(VectorSource(myCorpus))
MiningCases <- lapply(myCorpusCopy, function(x){grep(as.character(x),pattern = "\\<brexit")})
cat("Brexit occurance:" ,sum(unlist(MiningCases)))
## Brexit occurance: 17
MiningCases <- lapply(myCorpusCopy, function(x){grep(as.character(x),pattern = "\\<attack")})
cat("attack occurance:" ,sum(unlist(MiningCases)))
## attack occurance: 84
MiningCases <- lapply(myCorpusCopy, function(x){grep(as.character(x),pattern = "\\<oil")})
cat("oil occurance:" ,sum(unlist(MiningCases)))
## oil occurance: 30
A document-term matrix or term-document matrix is a mathematical matrix that describes the frequency of terms that occur in a collection of documents. In a document-term matrix, rows correspond to documents in the collection and columns correspond to terms.
tdm <- TermDocumentMatrix(myCorpus,control=list(wordLength = c(1,Inf)))
idx <- which(dimnames(tdm)$Terms == "attack")
inspect(tdm[idx+(1:5),10:20])
## <<TermDocumentMatrix (terms: 5, documents: 11)>>
## Non-/sparse entries: 0/55
## Sparsity : 100%
## Maximal term length: 9
## Weighting : term frequency (tf)
##
## Docs
## Terms 10 11 12 13 14 15 16 17 18 19 20
## attempt 0 0 0 0 0 0 0 0 0 0 0
## attend 0 0 0 0 0 0 0 0 0 0 0
## attention 0 0 0 0 0 0 0 0 0 0 0
## attitudes 0 0 0 0 0 0 0 0 0 0 0
## attorney 0 0 0 0 0 0 0 0 0 0 0
# inspect frequent words
freq.terms <- findFreqTerms(tdm,lowfreq=40)
summary(freq.terms)
## Length Class Mode
## 35 character character
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq > 50)
df <- data.frame(term=names(term.freq),freq=term.freq)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
ggplot(df,aes(x=term,y=freq))+geom_bar(stat="identity") + xlab("Terms") + ylab("Count") + coord_flip()
# which word are associated with "brexit"
findAssocs(tdm, c("oil", "stock", "attack"), c(0.1, 0.1, 0.1))
## $oil
## price opec aramco extend iea
## 0.35 0.25 0.23 0.23 0.23
## stateowned peak cut demand fall
## 0.23 0.22 0.21 0.20 0.20
## crude anticipation barkindo burning dakota
## 0.19 0.17 0.17 0.17 0.17
## danielyergin drawdown flailing glut keen
## 0.17 0.17 0.17 0.17 0.17
## louisiana mohammad nonopec nonshale opecled
## 0.17 0.17 0.17 0.17 0.17
## overhang pdvsa regain sheikhs skyrocketed
## 0.17 0.17 0.17 0.17 0.17
## squabble swung threeweek triggers wane
## 0.17 0.17 0.17 0.17 0.17
## ipo supplier saudi companies product
## 0.16 0.16 0.15 0.14 0.14
## access deeper diplomat heat output
## 0.12 0.12 0.12 0.12 0.12
## produced reading shale shipments western
## 0.12 0.12 0.12 0.12 0.12
## wider reduce well
## 0.12 0.10 0.10
##
## $stock
## fivesession streak extend track higher
## 0.31 0.29 0.25 0.22 0.21
## iea touch lose lower surge
## 0.21 0.21 0.19 0.18 0.18
## inch mac exchange abating advantage
## 0.17 0.17 0.16 0.15 0.15
## anticipation bearish bitcoinassets bogged bondsis
## 0.15 0.15 0.15 0.15 0.15
## bourses dax diverge eager fang
## 0.15 0.15 0.15 0.15 0.15
## global gold hoard ihope introduce
## 0.15 0.15 0.15 0.15 0.15
## jitters loss lull mostly muted
## 0.15 0.15 0.15 0.15 0.15
## pickers rebates regain selloff slightly
## 0.15 0.15 0.15 0.15 0.15
## somewhat teeth unicorns unisonhave woods
## 0.15 0.15 0.15 0.15 0.15
## alphabet edge european ipad start
## 0.14 0.14 0.14 0.14 0.14
## charts gain ahead product became
## 0.13 0.13 0.12 0.12 0.10
## bump dragged fast lack mixed
## 0.10 0.10 0.10 0.10 0.10
## plunge recover sank slowdown speed
## 0.10 0.10 0.10 0.10 0.10
## sterling volatile woo wrap
## 0.10 0.10 0.10 0.10
##
## $attack
## london bridge police terror
## 0.59 0.36 0.26 0.26
## isis knifewielding iran tehran
## 0.23 0.23 0.21 0.19
## taken terrorist unfold eyewitnesses
## 0.18 0.18 0.18 0.16
## gunman kill suspect wasnt
## 0.16 0.16 0.16 0.16
## casino extremist inhalation prevent
## 0.15 0.15 0.15 0.15
## resort rounds sighthe van
## 0.15 0.15 0.15 0.15
## connected saturday three arrest
## 0.14 0.14 0.14 0.13
## manila third hospital jihad
## 0.13 0.13 0.12 0.12
## last people plain propaganda
## 0.12 0.12 0.12 0.12
## witch addict alert cabins
## 0.12 0.11 0.11 0.11
## claim club counterterrorism dame
## 0.11 0.11 0.11 0.11
## dead deployed descent establish
## 0.11 0.11 0.11 0.11
## gambling gillette hammerwielding highrolling
## 0.11 0.11 0.11 0.11
## homegrown hotelandcasino intervene iraq
## 0.11 0.11 0.11 0.11
## islam istanbul know laments
## 0.11 0.11 0.11 0.11
## landmarks larger likeminded manchester
## 0.11 0.11 0.11 0.11
## moroccan mrs notre pilgrims
## 0.11 0.11 0.11 0.11
## planner razor rodrigo scene
## 0.11 0.11 0.11 0.11
## schick shaken shave sort
## 0.11 0.11 0.11 0.11
## sponsor stateclaimed swarmed thames
## 0.11 0.11 0.11 0.11
## trumpet veered villages westminster
## 0.11 0.11 0.11 0.11
## youssef zaghba butt coptic
## 0.11 0.11 0.10 0.10
## coverage dozen gym khuram
## 0.10 0.10 0.10 0.10
## melbourne pedestrians photo six
## 0.10 0.10 0.10 0.10
## suspend
## 0.10
# graph and Rgraphviz are not standand packages from CRAN you need to run:
# source("http://bioconductor.org/biocLite.R")
# biocLite(c("graph", "RBGL", "Rgraphviz"))
library(graph)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:twitteR':
##
## as.data.frame
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
library(Rgraphviz)
## Loading required package: grid
##
## Attaching package: 'Rgraphviz'
## The following object is masked from 'package:twitteR':
##
## name
plot(tdm,term=freq.terms,corThreshold=0.05,weighting=T)
options(warn=-1)
m <- as.matrix(tdm)
word.freq <- sort(rowSums(m),decreasing = T)
library(RColorBrewer)
pal2 <- brewer.pal(8,"Dark2")
library(wordcloud)
wordcloud(words = names(word.freq),freq=word.freq,min.freq=10,random.order=F,colors = pal2)
## remove sparse terms
tdm2 <- removeSparseTerms(tdm,sparse=0.95)
m2 <- as.matrix(tdm2)
## cluster terms
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix,method="ward.D")
## transpose the matrix to cluster tweets
m3 <- t(m2)
set.seed(17)
k=7
kmeansResult <- kmeans(m3,k)
round(kmeansResult$centers,digits=3)
## china opinion say trump write
## 1 0.000 0.000 1.000 0.200 0
## 2 1.000 0.000 0.067 0.000 0
## 3 0.011 0.607 0.000 0.000 1
## 4 0.021 0.068 0.000 1.016 0
## 5 0.000 0.000 0.000 0.000 0
## 6 0.000 1.000 0.023 0.000 0
## 7 0.000 0.667 0.000 1.000 1
for (i in 1:k) {
cat(paste("cluster",i,": ",sep=""))
s <- sort(kmeansResult$centers[i,],decreasing = T)
cat(names(s)[1:5], "\n")
}
## cluster1: say trump china opinion write
## cluster2: china say opinion trump write
## cluster3: write opinion china say trump
## cluster4: trump opinion china say write
## cluster5: china opinion say trump write
## cluster6: opinion say china trump write
## cluster7: trump write opinion china say
dtm <- as.DocumentTermMatrix(tdm)
rowTotals <- apply(dtm , 1, sum) #Find the sum of words in each Document
dtm.new <- dtm[rowTotals> 0, ]
library(topicmodels)
# find 7 topics
lda <- LDA(dtm.new,k=7)
# first 5 terms of every topic
term <- terms(lda,5)
term <- apply(term,MARGIN=2,paste,collapse=", ")
cat("The Topic examples are:")
## The Topic examples are:
term
## Topic 1
## "new, trump, first, write, bank"
## Topic 2
## "trump, say, people, attack, london"
## Topic 3
## "trump, presidencies, write, will, china"
## Topic 4
## "trump, may, one, get, attack"
## Topic 5
## "opinion, wall, take, street, say"
## Topic 6
## "china, new, police, opinion, attack"
## Topic 7
## "trump, opinion, london, china, attack"
# first topic identified for every tweet
library(data.table)
topic <- topics(lda,1)
topics <- data.frame(date=as.IDate(wsj.df$created),topic)
qplot(date,..count..,data=topics,geom="density",fill=term[topic],position="stack")