# the twitteR package makes it very easy to grab data on Twitte.
library(twitteR)
library(plyr)
library(ggplot2)
library(dplyr)
library(tm)
library(SnowballC)
library(wordcloud)
library(fpc)
library(cluster)
Get 1,500 most recent tweets mentioning ‘Lexus’:
api_key <- "bx7DuojzmTev6Lc5lsNlWWZbo"
api_secret <- "znHaM2gzKeyxJ7BlBdlQXNcaSx2wH5w4UL8zxiRQP8NPVgD5tP"
access_token <- "4202882661-wDHHnAeNZ71MEiQoHzxzBeBEB3eBIHG6pEA1NNF"
access_token_secret <-"CdJXPyjezwG21DchnkjyP8OrEDuTeKmm4Zyp94GFyJ0JJ"
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
Lexus = searchTwitter('@Lexus', n=1500)
head(Lexus)
## [[1]]
## [1] "TheNEStudios: .@Lexus premiered their LC 500 at the @NAIASDetroit this week! It was an honor to have them shoot at our studios! https://t.co/zZsoO0VrW1"
##
## [[2]]
## [1] "ojas: Who would have thought? Way to go @Lexus! https://t.co/Du2r6H7PZo"
##
## [[3]]
## [1] "RBC_Satelital: (VIDEO) @Lexus presenta el sedán del futuro: el LF-FC ► https://t.co/8uJfFMvNKX https://t.co/NsFLRfcsTb"
##
## [[4]]
## [1] "CraveOnline: #NAIAS2016: Car of the Show? New 2017 @Lexus LC 500: https://t.co/q2XjwhOos3"
##
## [[5]]
## [1] "carno93rally: RT @Lexus: Pucker up! #KissAGingerDay #LexusLC https://t.co/5BtIPveHwa"
##
## [[6]]
## [1] "MishawakaLexus: RT @motoringcomau: Most popular new car at the Detroit motor show? @Lexus LC 500 luxury coupe https://t.co/cbAVTWAVRn @NAIASDetroit https:/…"
Lexus.text = laply(Lexus, function(t) t$getText())
Lexus.text <- iconv(Lexus.text, "ASCII", "UTF-8", sub="")
head(Lexus.text)
## [1] ".@Lexus premiered their LC 500 at the @NAIASDetroit this week! It was an honor to have them shoot at our studios! https://t.co/zZsoO0VrW1"
## [2] "Who would have thought? Way to go @Lexus! https://t.co/Du2r6H7PZo"
## [3] "(VIDEO) @Lexus presenta el sedn del futuro: el LF-FC https://t.co/8uJfFMvNKX https://t.co/NsFLRfcsTb"
## [4] "#NAIAS2016: Car of the Show? New 2017 @Lexus LC 500: https://t.co/q2XjwhOos3"
## [5] "RT @Lexus: Pucker up! #KissAGingerDay #LexusLC https://t.co/5BtIPveHwa"
## [6] "RT @motoringcomau: Most popular new car at the Detroit motor show? @Lexus LC 500 luxury coupe https://t.co/cbAVTWAVRn @NAIASDetroit https:/"
Similary, get 1,500 most recent tweets mentioning some other car brands:
Benz = searchTwitter('@Mercedes-Benz', n=1500)
Benz.text = laply(Benz, function(t) t$getText())
Benz.text <- iconv(Benz.text, "ASCII", "UTF-8", sub="")
BMW = searchTwitter('@BMW', n=1500)
BMW.text = laply(BMW, function(t) t$getText())
BMW.text <- iconv(BMW.text, "ASCII", "UTF-8", sub="")
Toyota = searchTwitter('@Toyota', n=1500)
Toyota.text = laply(Toyota, function(t) t$getText())
Toyota.text <- iconv(Toyota.text, "ASCII", "UTF-8", sub="")
Hyundai = searchTwitter('@Hyundai', n=1500)
Hyundai.text = laply(Hyundai, function(t) t$getText())
Hyundai.text <- iconv(Hyundai.text, "ASCII", "UTF-8", sub="")
Ford = searchTwitter('@Ford', n=1500)
Ford.text = laply(Ford, function(t) t$getText())
Ford.text <- iconv(Ford.text, "ASCII", "UTF-8", sub="")
Nissan = searchTwitter('@Nissan', n=1500)
Nissan.text = laply(Nissan, function(t) t$getText())
Nissan.text <- iconv(Nissan.text, "ASCII", "UTF-8", sub="")
Audi = searchTwitter('@Audi', n=1500)
Audi.text = laply(Audi, function(t) t$getText())
Audi.text <- iconv(Audi.text, "ASCII", "UTF-8", sub="")
Download the positive and negative words from [https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html] for sentiment analysis.
pos = scan('/Users/youluqi/webdata/positive-words.txt',what = 'character', comment.char = ';')
neg = scan('/Users/youluqi/webdata/negative-words.txt',what = 'character', comment.char = ';')
Score the tweets of each brand, add new columns to identify the car brand for combining all the scores later.
source('sentiment.r')
Lexus.score = score.sentiment(Lexus.text,pos,neg, .progress = 'text')
Lexus.score$brand = 'Lexus'
head(Lexus.score)
## score
## 1 1
## 2 0
## 3 0
## 4 0
## 5 0
## 6 2
## text
## 1 .@Lexus premiered their LC 500 at the @NAIASDetroit this week! It was an honor to have them shoot at our studios! https://t.co/zZsoO0VrW1
## 2 Who would have thought? Way to go @Lexus! https://t.co/Du2r6H7PZo
## 3 (VIDEO) @Lexus presenta el sedn del futuro: el LF-FC https://t.co/8uJfFMvNKX https://t.co/NsFLRfcsTb
## 4 #NAIAS2016: Car of the Show? New 2017 @Lexus LC 500: https://t.co/q2XjwhOos3
## 5 RT @Lexus: Pucker up! #KissAGingerDay #LexusLC https://t.co/5BtIPveHwa
## 6 RT @motoringcomau: Most popular new car at the Detroit motor show? @Lexus LC 500 luxury coupe https://t.co/cbAVTWAVRn @NAIASDetroit https:/
## brand
## 1 Lexus
## 2 Lexus
## 3 Lexus
## 4 Lexus
## 5 Lexus
## 6 Lexus
hist(Lexus.score$score)
Benz.score = score.sentiment(Benz.text,pos,neg, .progress = 'text')
Benz.score$brand = 'Benz'
BMW.score = score.sentiment(BMW.text,pos,neg, .progress = 'text')
BMW.score$brand = 'BMW'
Toyota.score = score.sentiment(Toyota.text,pos,neg, .progress = 'text')
Toyota.score$brand = 'Toyota'
Hyundai.score = score.sentiment(Hyundai.text,pos,neg, .progress = 'text')
Hyundai.score$brand = 'Hyundai'
Ford.score = score.sentiment(Ford.text,pos,neg, .progress = 'text')
Ford.score$brand = 'Ford'
Nissan.score = score.sentiment(Nissan.text,pos,neg, .progress = 'text')
Nissan.score$brand = 'Nissan'
Audi.score = score.sentiment(Audi.text,pos,neg, .progress = 'text')
Audi.score$brand = 'Audi'
Combine the results in a data frame:
scores = rbind(Lexus.score, Benz.score, BMW.score, Toyota.score, Hyundai.score, Ford.score, Nissan.score, Audi.score)
ggplot(data = scores) +
geom_bar(aes(x = score, fill = brand),binwidth = 1) +
facet_grid(brand~.) +
theme_bw() + scale_color_brewer(palette="Pastel2")
Ignore the tweets that has the score from -1 to 1. Only focus on very positive and very negative tweets:
scores$vpos = as.numeric(scores$score > 1)
scores$vneg = as.numeric(scores$score < -1)
head(scores)
## score
## 1 1
## 2 0
## 3 0
## 4 0
## 5 0
## 6 2
## text
## 1 .@Lexus premiered their LC 500 at the @NAIASDetroit this week! It was an honor to have them shoot at our studios! https://t.co/zZsoO0VrW1
## 2 Who would have thought? Way to go @Lexus! https://t.co/Du2r6H7PZo
## 3 (VIDEO) @Lexus presenta el sedn del futuro: el LF-FC https://t.co/8uJfFMvNKX https://t.co/NsFLRfcsTb
## 4 #NAIAS2016: Car of the Show? New 2017 @Lexus LC 500: https://t.co/q2XjwhOos3
## 5 RT @Lexus: Pucker up! #KissAGingerDay #LexusLC https://t.co/5BtIPveHwa
## 6 RT @motoringcomau: Most popular new car at the Detroit motor show? @Lexus LC 500 luxury coupe https://t.co/cbAVTWAVRn @NAIASDetroit https:/
## brand vpos vneg
## 1 Lexus 0 0
## 2 Lexus 0 0
## 3 Lexus 0 0
## 4 Lexus 0 0
## 5 Lexus 0 0
## 6 Lexus 1 0
For each car brand, we use (the number of very positive tweets)/(the number of very positive tweets + the number of very negative tweets) as the final score of each brand.
twitter_score = ddply(scores, 'brand', summarise, pos_count = sum(vpos), neg_count = sum(vneg))
twitter_score$all_count = twitter_score$pos_count + twitter_score$neg_count
twitter_score$final_score = round(100*twitter_score$pos_count/twitter_score$all_count)
twitter_score = arrange(twitter_score, desc(final_score))
Add ACSI score (http://247wallst.com/special-report/2015/08/26/the-best-and-worst-car-brands/) to the score table:
twitter_score$ACSI_score = c(81,78,84,82,79,83,77,82)
twitter_score
## brand pos_count neg_count all_count final_score ACSI_score
## 1 Hyundai 499 27 526 95 81
## 2 Lexus 152 9 161 94 78
## 3 Audi 154 13 167 92 84
## 4 BMW 108 11 119 91 82
## 5 Ford 80 8 88 91 79
## 6 Nissan 127 24 151 84 83
## 7 Benz 27 6 33 82 77
## 8 Toyota 90 22 112 80 82
ggplot(data = twitter_score) +
geom_point( aes(x = final_score, y = ACSI_score, label = brand, color=brand),size = 4) +
geom_smooth(aes(x= final_score, y = final_score, group=1),se=F,method='lm')+
theme_bw()+
geom_text(aes(x= final_score, y= ACSI_score, label=brand),position=position_dodge(width=0.9), vjust=-0.6, size = 4)
In this plot, the points on diagonal line shoud have the same ACSI and twitte predicted score. The points which fall in the upper left side of the line must have a ACSI score higher than twitter predicted score. For Toyota, the ACSI score is slightly higher that predicted scores from twitter. For the rest car brands, the predicted scores are more or less higher that ACSI scores.
Build a corpus. Then conduct text cleaning.
Hyundai.text1 = laply(Hyundai, function(t) t$getText())
Hyundai.Corpus = Corpus(VectorSource(Hyundai.text1))
Hyundai.Corpus <- tm_map(Hyundai.Corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')),mc.cores=1)
Hyundai.Corpus = tm_map(Hyundai.Corpus, content_transformer(tolower))
Hyundai.Corpus = tm_map(Hyundai.Corpus, PlainTextDocument)
Hyundai.Corpus = tm_map(Hyundai.Corpus, removePunctuation, lazy = TRUE)
Hyundai.Corpus = tm_map(Hyundai.Corpus, removeNumbers, lazy = TRUE)
reURL = function(x) gsub("http[[:alnum:]]*","",x)
Hyundai.Corpus = tm_map(Hyundai.Corpus, reURL, lazy = TRUE)
mystopwords = c(stopwords("english"),"mercedes-benz","mercedes","benz","mercedesbenz","almost","always","ags","wasnt")
Hyundai.Corpus = tm_map(Hyundai.Corpus,removeWords,mystopwords,lazy=TRUE)
Hyundai.Corpus.Copy = Hyundai.Corpus
Hyundai.Corpus = tm_map(Hyundai.Corpus,stripWhitespace,lazy = TRUE)
Hyundai.Corpus = tm_map(Hyundai.Corpus, PlainTextDocument)
tdm = TermDocumentMatrix(Hyundai.Corpus)
tdm
## <<TermDocumentMatrix (terms: 2560, documents: 1500)>>
## Non-/sparse entries: 15291/3824709
## Sparsity : 100%
## Maximal term length: 26
## Weighting : term frequency (tf)
m = as.matrix(tdm)
word.freq = sort(rowSums(m),decreasing = T)
wordcloud(words=names(word.freq),freq=word.freq, scale=c(8,.2),min.freq=3,max.words=Inf, random.order=FALSE, rot.per=.15, colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(words = names(word.freq), freq = word.freq, scale =
## c(8, : hyundai could not be fit on page. It will not be plotted.
Find words that occur at least 30 times:
freqwords = findFreqTerms(tdm, lowfreq = 30)
freqwords
## [1] "advert" "amp" "anyone"
## [4] "babajidefadoju" "becausefootball" "behind"
## [7] "bonuses" "bowl" "brand"
## [10] "car" "chance" "chxta"
## [13] "commercial" "cool" "creativity"
## [16] "debuts" "deserve" "dont"
## [19] "enter" "est" "etisalatja"
## [22] "exciting" "fans" "first"
## [25] "flagship" "funny" "genesis"
## [28] "genesisusa" "get" "hyundai"
## [31] "hyundais" "info" "intent"
## [34] "jan" "live" "look"
## [37] "luxury" "marketing" "naias"
## [40] "naiasdetroit" "new" "nigeria"
## [43] "now" "people" "really"
## [46] "redefined" "reveal" "sending"
## [49] "super" "superbowl" "tco"
## [52] "tcohcfph" "tcoxrzvclz" "tcoypapzdqh"
## [55] "team" "texans" "tickets"
## [58] "watch" "whoever" "will"
## [61] "win" "xmas"
The code below identifies which words are associated with “luxury”:
assocs = findAssocs(tdm,terms = 'luxury',corlimit=0.2)
assocs
## $luxury
## naias genesisusa brand est flagship
## 0.67 0.63 0.58 0.52 0.52
## redefined tcoypapzdqh jan reveal watch
## 0.52 0.52 0.51 0.48 0.48
## new debuts tcohcfph tcoxrzvclz info
## 0.46 0.45 0.45 0.45 0.43
## live amp genesisusas todays tcoypapzvfe
## 0.41 0.39 0.32 0.32 0.31
## debut dont miss exciting genesis
## 0.30 0.30 0.30 0.29 0.28
## intimate tcorfwdrtrts newlycrowned tcosyjuenlk
## 0.26 0.26 0.21 0.21
Remove sparse terms:
tdm2 = removeSparseTerms(tdm, sparse = 0.95)
m2 = as.matrix(tdm2)
m3 = t(m2)
set.seed(123)
# number of clusters
k = 6
kmeansOut = kmeans(m3,k)
# cluster centers
round(kmeansOut$centers,digits = 3)
## advert anyone babajidefadoju becausefootball behind bonuses bowl brand
## 1 0.000 1.00 0.99 0.000 0.990 0.000 0.000 0.000
## 2 0.000 0.00 0.00 0.000 0.000 0.000 0.000 0.301
## 3 0.000 0.00 0.00 0.998 0.000 0.000 0.815 0.000
## 4 0.000 0.00 0.00 0.000 0.000 0.000 0.000 0.620
## 5 1.000 0.00 0.00 0.000 0.000 0.984 0.000 0.000
## 6 0.002 0.01 0.00 0.048 0.002 0.002 0.004 0.006
## chance chxta cool creativity deserve enter etisalatja fans funny
## 1 0.000 0.000 0.990 0.99 0.000 0.000 1 0.000 0.990
## 2 0.000 0.000 0.000 0.00 0.000 0.000 0 0.000 0.000
## 3 1.000 0.000 0.000 0.00 0.000 1.002 0 0.172 0.000
## 4 0.000 0.000 0.000 0.00 0.000 0.000 0 0.000 0.000
## 5 0.000 0.984 0.000 0.00 0.984 0.000 0 0.000 0.000
## 6 0.021 0.008 0.025 0.00 0.002 0.004 0 0.004 0.002
## genesis genesisusa get hyundai intent live luxury marketing naias
## 1 0.000 0.000 0.000 1.000 1 0.000 0.000 0 0.000
## 2 0.098 1.098 0.000 0.988 0 0.520 0.803 0 0.780
## 3 0.000 0.000 0.000 1.002 0 0.000 0.000 0 0.000
## 4 0.832 0.000 0.058 0.905 0 0.029 0.591 0 0.869
## 5 0.000 0.000 0.984 1.000 0 0.000 0.000 1 0.000
## 6 0.054 0.023 0.033 1.029 0 0.008 0.004 0 0.046
## new nigeria now people really sending super superbowl tco team
## 1 0.000 0.000 0.000 0.000 0.000 0.00 0.000 0.000 0.000 0
## 2 0.526 0.000 0.000 0.000 0.000 0.00 0.000 0.000 0.006 0
## 3 0.000 0.000 0.000 0.808 0.000 0.98 0.815 0.208 0.020 0
## 4 0.307 0.000 0.000 0.000 0.007 0.00 0.000 0.000 0.182 0
## 5 0.000 0.992 0.984 0.000 1.000 0.00 0.000 0.000 0.961 1
## 6 0.034 0.000 0.027 0.008 0.000 0.00 0.002 0.013 0.011 0
## tickets watch whoever win xmas
## 1 0.000 0.000 1 0.000 0.000
## 2 0.000 0.387 0 0.000 0.000
## 3 0.176 0.016 0 1.005 0.000
## 4 0.000 0.007 0 0.000 0.000
## 5 0.000 0.000 0 0.000 0.984
## 6 0.006 0.004 0 0.031 0.000
#print clusters
for(i in 1:k){
cat(paste("cluster", i, ": ", sep = ""))
s = sort(kmeansOut$centers[i, ], decreasing = T)
cat(names(s)[1:5],"\n")
}
## cluster1: anyone etisalatja hyundai intent whoever
## cluster2: genesisusa hyundai luxury naias new
## cluster3: win enter hyundai chance becausefootball
## cluster4: hyundai naias genesis brand luxury
## cluster5: advert hyundai marketing really team
## cluster6: hyundai genesis becausefootball naias new
Partitioning (clustering) of the data into k clusters “around medoids”, it is more robust than kmeans.
pamOut = pamk(m3)
k2 = pamOut$nc
k2
## [1] 10
pamOut = pamOut$pamobject
# print cluster medoids
for(i in 1:k2){
cat("cluster",i,": ",colnames(pamOut$medoids)[which(pamOut$medoids[i,]==1)],"\n")
}
## cluster 1 : genesis hyundai
## cluster 2 : hyundai
## cluster 3 : becausefootball chance enter fans hyundai sending superbowl tickets win
## cluster 4 : brand hyundai luxury naias tco
## cluster 5 : brand genesisusa hyundai luxury naias new
## cluster 6 : genesisusa hyundai live luxury naias watch
## cluster 7 : genesisusa hyundai live new
## cluster 8 : becausefootball bowl chance enter hyundai people sending super win
## cluster 9 : anyone babajidefadoju behind cool creativity etisalatja funny hyundai intent whoever
## cluster 10 : advert bonuses chxta deserve get hyundai marketing nigeria now really tco team xmas
Visualize clustering result:
clusplot(pamOut, col.p = pamOut$clustering)