Data Mining

The field of data mining has evolved from the disciplines of statistics and artificial intelligence.

Data mining is a rapidly growing field concerned with developing techniques that help managers make intelligent use of the vast data repositories that organizations now accumulate.

Online social media represent a fundamental shift in how information is produced, transferred, and consumed. User-generated content in the form of blog posts, comments, and tweets establishes a direct connection between the producers and the consumers of information.

Tracking the pulse of social media outlets enables companies to gain feedback and insight into how to improve and market their products. For consumers, the abundance of information and opinions from diverse sources helps them tap into the wisdom of crowds and make more informed decisions.

library(twitteR)
## Warning: package 'twitteR' was built under R version 3.3.3

Step 1: Load the data from the Twitter API

# you need to use your own keys, which can be obtained from Twitter
# api_key <- "xxxxxxxxxxxxxxxxxxxxxxxxx"
# api_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# access_token_secret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
## [1] "Using direct authentication"
wsj_tweet <- userTimeline("wsj", n=5000)
## Warning in statusBase(cmd, params, n, 3200, ...): statuses/user_timeline
## has a cap of 3200 statuses, clipping
n.tweets <- length(wsj_tweet)   # number of tweets actually returned (capped at 3200)
wsj.df <- twListToDF(wsj_tweet) # convert the list of status objects to a data frame


# print tweets 1, 2 and 5, wrapped at 60 characters per line
for (i in c(1:2, 5)) {
  cat(paste0("[", i, "] "))
  writeLines(strwrap(wsj.df$text[i], 60))
}
## [1] Spending on microtransactions across all game platforms
## reached $71 billion world-wide last year
## https://t.co/TWBKo7e85O
## [2] Beijing airs allegations involving whistleblowing
## businessman living in New York https://t.co/EZsLx6NNKj
## [5] Newly public companies, eager to take advantage of rising
## U.S. stock prices, are quickly returning to the market
## https://t.co/QlBnFoTbRM

Step 2: Text Cleaning

library(tm)
## Loading required package: NLP
myCorpus <- Corpus(VectorSource(wsj.df$text))  # build a corpus, one document per tweet

# convert all text to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))

# remove everything that is not a letter or whitespace (numbers, punctuation)
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))

# remove English stopwords, but keep "no" and "yes", which can carry meaning in tweets
exceptions  <- c("no", "yes")
myStopwords <- setdiff(stopwords("en"), exceptions)

myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

# collapse runs of whitespace into single spaces
myCorpus <- tm_map(myCorpus, stripWhitespace)
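
A quick sanity check of the two custom transformers on a made-up tweet (not from the data):

removeURL("Stocks rally 5% https://t.co/abc123")  # "Stocks rally 5% "
removeNumPunct("Stocks rally 5% ")                # "Stocks rally  "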

myCorpusCopy <- myCorpus  # keep an unstemmed copy for stem completion and word counting

# Stemming and Stem Completion

myCorpus <- tm_map(myCorpus,stemDocument)


## inspect tweets 1, 2 and 5 after stemming

for (i in c(1:2,5)) {
  cat(paste0("[",i,"]"))
  writeLines(strwrap(as.character(myCorpus[[i]]),60))
}
## [1]spend microtransact across game platform reach billion
## worldwid last year
## [2]beij air alleg involv whistleblow businessman live new york
## [5]newli public compani eager take advantag rise us stock
## price quick return market
stemCompletion2 <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))      # split the document into words
  x <- x[x != ""]                                  # drop empty strings
  x <- stemCompletion(x, dictionary = dictionary)  # complete each stem against the unstemmed corpus
  x <- paste(x, sep = "", collapse = " ")          # reassemble into one string
  PlainTextDocument(stripWhitespace(x))
}
myCorpus <- lapply(myCorpus, stemCompletion2, dictionary=myCorpusCopy)
myCorpus <- Corpus(VectorSource(myCorpus))
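
For intuition, stemCompletion() maps each stem back to its most frequent completion in the dictionary (type = "prevalent" is the default). A quick check on a couple of sample stems (the results depend on the tweets retrieved):

stemCompletion(c("compani", "peopl"), dictionary = myCorpusCopy)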

Step 3: Data Mining: count the frequency of words of interest such as “brexit”

# grep() flags each tweet that contains the term, so the sum is the number of tweets mentioning it
MiningCases <- lapply(myCorpusCopy, function(x) grep("\\<brexit", as.character(x)))
cat("Tweets mentioning brexit:", sum(unlist(MiningCases)))
## Tweets mentioning brexit: 17
MiningCases <- lapply(myCorpusCopy, function(x) grep("\\<attack", as.character(x)))
cat("Tweets mentioning attack:", sum(unlist(MiningCases)))
## Tweets mentioning attack: 84
MiningCases <- lapply(myCorpusCopy, function(x) grep("\\<oil", as.character(x)))
cat("Tweets mentioning oil:", sum(unlist(MiningCases)))
## Tweets mentioning oil: 30
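
Since grep() only flags whether a tweet matches, repeats within a tweet are not counted. A minimal sketch of counting every occurrence instead, using gregexpr() and a hypothetical helper countOccurrences() (the helper name is ours, not from the original):

# count all matches of a pattern across the corpus, including repeats within a tweet
countOccurrences <- function(corpus, pattern) {
  hits <- lapply(corpus, function(x) gregexpr(pattern, as.character(x))[[1]])
  sum(sapply(hits, function(m) if (m[1] == -1) 0 else length(m)))
}
countOccurrences(myCorpusCopy, "\\<brexit")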

Step 4: Create Term-Document Matrix

A document-term matrix is a mathematical matrix that describes the frequency of terms occurring in a collection of documents: rows correspond to documents in the collection and columns correspond to terms. A term-document matrix, which is what we build below, is simply its transpose: rows are terms and columns are documents.
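
As a toy illustration with made-up sentences (not the tweet data):

toy <- Corpus(VectorSource(c("oil prices fall", "oil demand rises", "stocks fall")))
inspect(TermDocumentMatrix(toy))
# rows are the terms (demand, fall, oil, prices, rises, stocks); columns are the 3 documents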

tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))  # note: the option name is 'wordLengths'

idx <- which(dimnames(tdm)$Terms == "attack")
inspect(tdm[idx + (1:5), 10:20])  # the five terms following "attack", for documents 10 to 20
## <<TermDocumentMatrix (terms: 5, documents: 11)>>
## Non-/sparse entries: 0/55
## Sparsity           : 100%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## 
##            Docs
## Terms       10 11 12 13 14 15 16 17 18 19 20
##   attempt    0  0  0  0  0  0  0  0  0  0  0
##   attend     0  0  0  0  0  0  0  0  0  0  0
##   attention  0  0  0  0  0  0  0  0  0  0  0
##   attitudes  0  0  0  0  0  0  0  0  0  0  0
##   attorney   0  0  0  0  0  0  0  0  0  0  0

Step 5-1: Visualization of frequently occurring words

# inspect frequent words

freq.terms <- findFreqTerms(tdm, lowfreq = 40)  # terms appearing at least 40 times
summary(freq.terms)
##    Length     Class      Mode 
##        35 character character
term.freq <- rowSums(as.matrix(tdm))

# keep only terms appearing more than 50 times for the bar chart
term.freq <- subset(term.freq, term.freq > 50)
df <- data.frame(term = names(term.freq), freq = term.freq)

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
ggplot(df,aes(x=term,y=freq))+geom_bar(stat="identity") + xlab("Terms") + ylab("Count") + coord_flip()
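
The bars come out in alphabetical term order; if a frequency-sorted chart is preferred, reorder() handles it (a small variation on the plot above):

ggplot(df, aes(x = reorder(term, freq), y = freq)) +
  geom_bar(stat = "identity") + xlab("Terms") + ylab("Count") + coord_flip()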

Step 5-2: Visualization of word associations

# which words are associated with "oil", "stock" and "attack"?
# findAssocs() returns terms whose correlation with each input term is at least the given threshold

findAssocs(tdm, c("oil", "stock", "attack"), c(0.1, 0.1, 0.1))
## $oil
##        price         opec       aramco       extend          iea 
##         0.35         0.25         0.23         0.23         0.23 
##   stateowned         peak          cut       demand         fall 
##         0.23         0.22         0.21         0.20         0.20 
##        crude anticipation     barkindo      burning       dakota 
##         0.19         0.17         0.17         0.17         0.17 
## danielyergin     drawdown     flailing         glut         keen 
##         0.17         0.17         0.17         0.17         0.17 
##    louisiana     mohammad      nonopec     nonshale      opecled 
##         0.17         0.17         0.17         0.17         0.17 
##     overhang        pdvsa       regain      sheikhs  skyrocketed 
##         0.17         0.17         0.17         0.17         0.17 
##     squabble        swung    threeweek     triggers         wane 
##         0.17         0.17         0.17         0.17         0.17 
##          ipo     supplier        saudi    companies      product 
##         0.16         0.16         0.15         0.14         0.14 
##       access       deeper     diplomat         heat       output 
##         0.12         0.12         0.12         0.12         0.12 
##     produced      reading        shale    shipments      western 
##         0.12         0.12         0.12         0.12         0.12 
##        wider       reduce         well 
##         0.12         0.10         0.10 
## 
## $stock
##   fivesession        streak        extend         track        higher 
##          0.31          0.29          0.25          0.22          0.21 
##           iea         touch          lose         lower         surge 
##          0.21          0.21          0.19          0.18          0.18 
##          inch           mac      exchange       abating     advantage 
##          0.17          0.17          0.16          0.15          0.15 
##  anticipation       bearish bitcoinassets        bogged       bondsis 
##          0.15          0.15          0.15          0.15          0.15 
##       bourses           dax       diverge         eager          fang 
##          0.15          0.15          0.15          0.15          0.15 
##        global          gold         hoard         ihope     introduce 
##          0.15          0.15          0.15          0.15          0.15 
##       jitters          loss          lull        mostly         muted 
##          0.15          0.15          0.15          0.15          0.15 
##       pickers       rebates        regain       selloff      slightly 
##          0.15          0.15          0.15          0.15          0.15 
##      somewhat         teeth      unicorns    unisonhave         woods 
##          0.15          0.15          0.15          0.15          0.15 
##      alphabet          edge      european          ipad         start 
##          0.14          0.14          0.14          0.14          0.14 
##        charts          gain         ahead       product        became 
##          0.13          0.13          0.12          0.12          0.10 
##          bump       dragged          fast          lack         mixed 
##          0.10          0.10          0.10          0.10          0.10 
##        plunge       recover          sank      slowdown         speed 
##          0.10          0.10          0.10          0.10          0.10 
##      sterling      volatile           woo          wrap 
##          0.10          0.10          0.10          0.10 
## 
## $attack
##           london           bridge           police           terror 
##             0.59             0.36             0.26             0.26 
##             isis    knifewielding             iran           tehran 
##             0.23             0.23             0.21             0.19 
##            taken        terrorist           unfold     eyewitnesses 
##             0.18             0.18             0.18             0.16 
##           gunman             kill          suspect            wasnt 
##             0.16             0.16             0.16             0.16 
##           casino        extremist       inhalation          prevent 
##             0.15             0.15             0.15             0.15 
##           resort           rounds          sighthe              van 
##             0.15             0.15             0.15             0.15 
##        connected         saturday            three           arrest 
##             0.14             0.14             0.14             0.13 
##           manila            third         hospital            jihad 
##             0.13             0.13             0.12             0.12 
##             last           people            plain       propaganda 
##             0.12             0.12             0.12             0.12 
##            witch           addict            alert           cabins 
##             0.12             0.11             0.11             0.11 
##            claim             club counterterrorism             dame 
##             0.11             0.11             0.11             0.11 
##             dead         deployed          descent        establish 
##             0.11             0.11             0.11             0.11 
##         gambling         gillette   hammerwielding      highrolling 
##             0.11             0.11             0.11             0.11 
##        homegrown   hotelandcasino        intervene             iraq 
##             0.11             0.11             0.11             0.11 
##            islam         istanbul             know          laments 
##             0.11             0.11             0.11             0.11 
##        landmarks           larger       likeminded       manchester 
##             0.11             0.11             0.11             0.11 
##         moroccan              mrs            notre         pilgrims 
##             0.11             0.11             0.11             0.11 
##          planner            razor          rodrigo            scene 
##             0.11             0.11             0.11             0.11 
##           schick           shaken            shave             sort 
##             0.11             0.11             0.11             0.11 
##          sponsor     stateclaimed          swarmed           thames 
##             0.11             0.11             0.11             0.11 
##          trumpet           veered         villages      westminster 
##             0.11             0.11             0.11             0.11 
##          youssef           zaghba             butt           coptic 
##             0.11             0.11             0.10             0.10 
##         coverage            dozen              gym           khuram 
##             0.10             0.10             0.10             0.10 
##        melbourne      pedestrians            photo              six 
##             0.10             0.10             0.10             0.10 
##          suspend 
##             0.10
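
Under the hood, findAssocs() reports the Pearson correlation of two terms’ frequency vectors across documents; for example, the 0.35 between “oil” and “price” above can be checked by hand:

tdm_m <- as.matrix(tdm)
cor(tdm_m["oil", ], tdm_m["price", ])  # should be about 0.35 before rounding
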
# graph and Rgraphviz are not standard CRAN packages; to install them, run:
# source("http://bioconductor.org/biocLite.R")
# biocLite(c("graph", "RBGL", "Rgraphviz"))
library(graph)
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:twitteR':
## 
##     as.data.frame
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min
library(Rgraphviz)
## Loading required package: grid
## 
## Attaching package: 'Rgraphviz'
## The following object is masked from 'package:twitteR':
## 
##     name
plot(tdm, terms = freq.terms, corThreshold = 0.05, weighting = TRUE)  # term network; edges link terms with correlation above 0.05

Step 5-3: Visualization of frequent words with a word cloud

options(warn = -1)  # suppress warnings (wordcloud warns when a word does not fit on the canvas)
m <- as.matrix(tdm)
word.freq <- sort(rowSums(m), decreasing = TRUE)

library(RColorBrewer) 

pal2 <- brewer.pal(8,"Dark2")

library(wordcloud)
wordcloud(words = names(word.freq),freq=word.freq,min.freq=10,random.order=F,colors = pal2)
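
The cloud layout is randomized, so it differs between runs; fixing the seed first makes it reproducible (the seed value below is arbitrary):

set.seed(375)  # any fixed value gives a repeatable layout
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 10,
          random.order = FALSE, colors = pal2)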

Step 5-4: Clustering

## remove sparse terms

tdm2 <- removeSparseTerms(tdm, sparse = 0.95)  # drop terms missing from more than 95% of tweets
m2 <- as.matrix(tdm2)

## cluster terms

distMatrix <- dist(scale(m2))                  # distances between scaled term profiles
fit <- hclust(distMatrix, method = "ward.D")   # Ward's hierarchical clustering
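
The fit object is most useful plotted as a dendrogram; cutting it into groups shows which terms cluster together (a sketch using base graphics and cutree()):

plot(fit, cex = 0.9, hang = -1)  # dendrogram of the clustered terms
rect.hclust(fit, k = 7)          # draw boxes around 7 clusters
groups <- cutree(fit, k = 7)     # cluster membership of each term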

## transpose the matrix to cluster tweets

m3 <- t(m2)   # transpose: rows are now tweets, columns are terms
set.seed(17)  # kmeans starts from random centers, so fix the seed
k <- 7
kmeansResult <- kmeans(m3, k)
round(kmeansResult$centers,digits=3)
##   china opinion   say trump write
## 1 0.000   0.000 1.000 0.200     0
## 2 1.000   0.000 0.067 0.000     0
## 3 0.011   0.607 0.000 0.000     1
## 4 0.021   0.068 0.000 1.016     0
## 5 0.000   0.000 0.000 0.000     0
## 6 0.000   1.000 0.023 0.000     0
## 7 0.000   0.667 0.000 1.000     1
for (i in 1:k) {
  cat(paste("cluster",i,": ",sep=""))
  s <- sort(kmeansResult$centers[i,],decreasing = T)
  cat(names(s)[1:5], "\n")
}
## cluster1: say trump china opinion write 
## cluster2: china say opinion trump write 
## cluster3: write opinion china say trump 
## cluster4: trump opinion china say write 
## cluster5: china opinion say trump write 
## cluster6: opinion say china trump write 
## cluster7: trump write opinion china say
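
kmeansResult$size reports how many tweets landed in each cluster, a quick check on whether the seven clusters are balanced:

kmeansResult$size  # number of tweets in each of the 7 clusters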

Step 6: Topic Modelling

dtm <- as.DocumentTermMatrix(tdm)

rowTotals <- apply(dtm, 1, sum)   # find the sum of words in each document
dtm.new   <- dtm[rowTotals > 0, ] # LDA cannot handle documents with no terms

library(topicmodels)
# find 7 topics (LDA is stochastic; pass control = list(seed = ...) for a reproducible fit)
lda <- LDA(dtm.new, k = 7)
# first 5 terms of every topic
term <- terms(lda,5)

term <- apply(term,MARGIN=2,paste,collapse=", ")

cat("The Topic examples are:")
## The Topic examples are:
term 
##                                   Topic 1 
##          "new, trump, first, write, bank" 
##                                   Topic 2 
##      "trump, say, people, attack, london" 
##                                   Topic 3 
## "trump, presidencies, write, will, china" 
##                                   Topic 4 
##            "trump, may, one, get, attack" 
##                                   Topic 5 
##        "opinion, wall, take, street, say" 
##                                   Topic 6 
##     "china, new, police, opinion, attack" 
##                                   Topic 7 
##   "trump, opinion, london, china, attack"
# first topic identified for every tweet

library(data.table)
topic <- topics(lda, 1)
# use only the dates of documents kept in dtm.new (tweets with no remaining terms were dropped above)
topics <- data.frame(date = as.IDate(wsj.df$created[rowTotals > 0]), topic)
qplot(date, ..count.., data = topics, geom = "density", fill = term[topic], position = "stack")