Mining Twitter Data for Customer’s Attitudes towards Car Brands

Install and load the required packages

# the twitteR package makes it very easy to grab data on Twitte.
library(twitteR)
library(plyr)
library(ggplot2)
library(dplyr)
library(tm)
library(SnowballC)
library(wordcloud)
library(fpc)
library(cluster)

Scraping Twitter and collect tweet text

Get 1,500 most recent tweets mentioning ‘Lexus’:

api_key <- "bx7DuojzmTev6Lc5lsNlWWZbo"

api_secret <- "znHaM2gzKeyxJ7BlBdlQXNcaSx2wH5w4UL8zxiRQP8NPVgD5tP"

access_token <- "4202882661-wDHHnAeNZ71MEiQoHzxzBeBEB3eBIHG6pEA1NNF"

access_token_secret <-"CdJXPyjezwG21DchnkjyP8OrEDuTeKmm4Zyp94GFyJ0JJ"
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)

Lexus = searchTwitter('@Lexus', n=1500)
head(Lexus)

## [[1]]
## [1] "TheNEStudios: .@Lexus premiered their LC 500 at the @NAIASDetroit this week! It was an honor to have them shoot at our studios! https://t.co/zZsoO0VrW1"
## 
## [[2]]
## [1] "ojas: Who would have thought? Way to go @Lexus! https://t.co/Du2r6H7PZo"
## 
## [[3]]
## [1] "RBC_Satelital: (VIDEO) @Lexus presenta el sedán del futuro: el LF-FC ► https://t.co/8uJfFMvNKX https://t.co/NsFLRfcsTb"
## 
## [[4]]
## [1] "CraveOnline: #NAIAS2016: Car of the Show? New 2017 @Lexus LC 500: https://t.co/q2XjwhOos3"
## 
## [[5]]
## [1] "carno93rally: RT @Lexus: Pucker up! #KissAGingerDay #LexusLC https://t.co/5BtIPveHwa"
## 
## [[6]]
## [1] "MishawakaLexus: RT @motoringcomau: Most popular new car at the Detroit motor show? @Lexus LC 500 luxury coupe https://t.co/cbAVTWAVRn @NAIASDetroit https:/…"

Lexus.text = laply(Lexus, function(t) t$getText())
Lexus.text <- iconv(Lexus.text, "ASCII", "UTF-8", sub="")
head(Lexus.text)

## [1] ".@Lexus premiered their LC 500 at the @NAIASDetroit this week! It was an honor to have them shoot at our studios! https://t.co/zZsoO0VrW1"  
## [2] "Who would have thought? Way to go @Lexus! https://t.co/Du2r6H7PZo"                                                                          
## [3] "(VIDEO) @Lexus presenta el sedn del futuro: el LF-FC  https://t.co/8uJfFMvNKX https://t.co/NsFLRfcsTb"                                      
## [4] "#NAIAS2016: Car of the Show? New 2017 @Lexus LC 500: https://t.co/q2XjwhOos3"                                                               
## [5] "RT @Lexus: Pucker up! #KissAGingerDay #LexusLC https://t.co/5BtIPveHwa"                                                                     
## [6] "RT @motoringcomau: Most popular new car at the Detroit motor show? @Lexus LC 500 luxury coupe https://t.co/cbAVTWAVRn @NAIASDetroit https:/"

Similary, get 1,500 most recent tweets mentioning some other car brands:

Benz = searchTwitter('@Mercedes-Benz', n=1500)
Benz.text = laply(Benz, function(t) t$getText())
Benz.text <- iconv(Benz.text, "ASCII", "UTF-8", sub="")

BMW = searchTwitter('@BMW', n=1500)
BMW.text = laply(BMW, function(t) t$getText())
BMW.text <- iconv(BMW.text, "ASCII", "UTF-8", sub="")

Toyota = searchTwitter('@Toyota', n=1500)
Toyota.text = laply(Toyota, function(t) t$getText())
Toyota.text <- iconv(Toyota.text, "ASCII", "UTF-8", sub="")

Hyundai = searchTwitter('@Hyundai', n=1500)
Hyundai.text = laply(Hyundai, function(t) t$getText())
Hyundai.text <- iconv(Hyundai.text, "ASCII", "UTF-8", sub="")

Ford = searchTwitter('@Ford', n=1500)
Ford.text = laply(Ford, function(t) t$getText())
Ford.text <- iconv(Ford.text, "ASCII", "UTF-8", sub="")

Nissan = searchTwitter('@Nissan', n=1500)
Nissan.text = laply(Nissan, function(t) t$getText())
Nissan.text <- iconv(Nissan.text, "ASCII", "UTF-8", sub="")

Audi = searchTwitter('@Audi', n=1500)
Audi.text = laply(Audi, function(t) t$getText())
Audi.text <- iconv(Audi.text, "ASCII", "UTF-8", sub="")

Text Mining & Sentiment Analysis

1. Load sentiment word lists

Download the positive and negative words from [https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html] for sentiment analysis.

pos = scan('/Users/youluqi/webdata/positive-words.txt',what = 'character', comment.char = ';')
neg = scan('/Users/youluqi/webdata/negative-words.txt',what = 'character', comment.char = ';')

2. Score sentiment of each brand

Score the tweets of each brand, add new columns to identify the car brand for combining all the scores later.

source('sentiment.r')

Lexus.score = score.sentiment(Lexus.text,pos,neg, .progress = 'text')
Lexus.score$brand = 'Lexus'

head(Lexus.score)

##   score
## 1     1
## 2     0
## 3     0
## 4     0
## 5     0
## 6     2
##                                                                                                                                          text
## 1   .@Lexus premiered their LC 500 at the @NAIASDetroit this week! It was an honor to have them shoot at our studios! https://t.co/zZsoO0VrW1
## 2                                                                           Who would have thought? Way to go @Lexus! https://t.co/Du2r6H7PZo
## 3                                       (VIDEO) @Lexus presenta el sedn del futuro: el LF-FC  https://t.co/8uJfFMvNKX https://t.co/NsFLRfcsTb
## 4                                                                #NAIAS2016: Car of the Show? New 2017 @Lexus LC 500: https://t.co/q2XjwhOos3
## 5                                                                      RT @Lexus: Pucker up! #KissAGingerDay #LexusLC https://t.co/5BtIPveHwa
## 6 RT @motoringcomau: Most popular new car at the Detroit motor show? @Lexus LC 500 luxury coupe https://t.co/cbAVTWAVRn @NAIASDetroit https:/
##   brand
## 1 Lexus
## 2 Lexus
## 3 Lexus
## 4 Lexus
## 5 Lexus
## 6 Lexus

hist(Lexus.score$score)

Benz.score = score.sentiment(Benz.text,pos,neg, .progress = 'text')
Benz.score$brand = 'Benz'
BMW.score = score.sentiment(BMW.text,pos,neg, .progress = 'text')
BMW.score$brand = 'BMW'
Toyota.score = score.sentiment(Toyota.text,pos,neg, .progress = 'text')
Toyota.score$brand = 'Toyota'
Hyundai.score = score.sentiment(Hyundai.text,pos,neg, .progress = 'text')
Hyundai.score$brand = 'Hyundai'
Ford.score = score.sentiment(Ford.text,pos,neg, .progress = 'text')
Ford.score$brand = 'Ford'
Nissan.score = score.sentiment(Nissan.text,pos,neg, .progress = 'text')
Nissan.score$brand = 'Nissan'
Audi.score = score.sentiment(Audi.text,pos,neg, .progress = 'text')
Audi.score$brand = 'Audi'

Combine the results in a data frame:

scores = rbind(Lexus.score, Benz.score, BMW.score, Toyota.score, Hyundai.score, Ford.score, Nissan.score, Audi.score)

3. Visulize the distributions of score from different brands

ggplot(data = scores) +
        geom_bar(aes(x = score, fill = brand),binwidth = 1) +
        facet_grid(brand~.) +
        theme_bw() + scale_color_brewer(palette="Pastel2")

4. Compare sentiment score with ACSI score

Ignore the tweets that has the score from -1 to 1. Only focus on very positive and very negative tweets:

scores$vpos = as.numeric(scores$score > 1)
scores$vneg = as.numeric(scores$score < -1)
head(scores)

##   score
## 1     1
## 2     0
## 3     0
## 4     0
## 5     0
## 6     2
##                                                                                                                                          text
## 1   .@Lexus premiered their LC 500 at the @NAIASDetroit this week! It was an honor to have them shoot at our studios! https://t.co/zZsoO0VrW1
## 2                                                                           Who would have thought? Way to go @Lexus! https://t.co/Du2r6H7PZo
## 3                                       (VIDEO) @Lexus presenta el sedn del futuro: el LF-FC  https://t.co/8uJfFMvNKX https://t.co/NsFLRfcsTb
## 4                                                                #NAIAS2016: Car of the Show? New 2017 @Lexus LC 500: https://t.co/q2XjwhOos3
## 5                                                                      RT @Lexus: Pucker up! #KissAGingerDay #LexusLC https://t.co/5BtIPveHwa
## 6 RT @motoringcomau: Most popular new car at the Detroit motor show? @Lexus LC 500 luxury coupe https://t.co/cbAVTWAVRn @NAIASDetroit https:/
##   brand vpos vneg
## 1 Lexus    0    0
## 2 Lexus    0    0
## 3 Lexus    0    0
## 4 Lexus    0    0
## 5 Lexus    0    0
## 6 Lexus    1    0

For each car brand, we use (the number of very positive tweets)/(the number of very positive tweets + the number of very negative tweets) as the final score of each brand.

twitter_score = ddply(scores, 'brand', summarise, pos_count = sum(vpos), neg_count = sum(vneg))
twitter_score$all_count = twitter_score$pos_count + twitter_score$neg_count
twitter_score$final_score = round(100*twitter_score$pos_count/twitter_score$all_count)
twitter_score = arrange(twitter_score, desc(final_score))

Add ACSI score (http://247wallst.com/special-report/2015/08/26/the-best-and-worst-car-brands/) to the score table:

twitter_score$ACSI_score = c(81,78,84,82,79,83,77,82)
twitter_score

##     brand pos_count neg_count all_count final_score ACSI_score
## 1 Hyundai       499        27       526          95         81
## 2   Lexus       152         9       161          94         78
## 3    Audi       154        13       167          92         84
## 4     BMW       108        11       119          91         82
## 5    Ford        80         8        88          91         79
## 6  Nissan       127        24       151          84         83
## 7    Benz        27         6        33          82         77
## 8  Toyota        90        22       112          80         82

ggplot(data = twitter_score) +
        geom_point( aes(x = final_score, y = ACSI_score, label = brand, color=brand),size = 4) +
        geom_smooth(aes(x= final_score, y = final_score, group=1),se=F,method='lm')+
        theme_bw()+
        geom_text(aes(x= final_score, y= ACSI_score, label=brand),position=position_dodge(width=0.9), vjust=-0.6, size = 4)

5. Conclusion

In this plot, the points on diagonal line shoud have the same ACSI and twitte predicted score. The points which fall in the upper left side of the line must have a ACSI score higher than twitter predicted score. For Toyota, the ACSI score is slightly higher that predicted scores from twitter. For the rest car brands, the predicted scores are more or less higher that ACSI scores.

Since the Hyundai gets the highest sentiment score, I am going to analyze the Hyundai tweets deeper:

Text Cleaning

Build a corpus. Then conduct text cleaning.

Hyundai.text1 = laply(Hyundai, function(t) t$getText())

Hyundai.Corpus = Corpus(VectorSource(Hyundai.text1))
Hyundai.Corpus <- tm_map(Hyundai.Corpus, content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')),mc.cores=1)
Hyundai.Corpus = tm_map(Hyundai.Corpus, content_transformer(tolower))
Hyundai.Corpus = tm_map(Hyundai.Corpus, PlainTextDocument)
Hyundai.Corpus = tm_map(Hyundai.Corpus, removePunctuation, lazy = TRUE)
Hyundai.Corpus = tm_map(Hyundai.Corpus, removeNumbers, lazy = TRUE)
reURL = function(x) gsub("http[[:alnum:]]*","",x)
Hyundai.Corpus = tm_map(Hyundai.Corpus, reURL, lazy = TRUE)
mystopwords = c(stopwords("english"),"mercedes-benz","mercedes","benz","mercedesbenz","almost","always","ags","wasnt")
Hyundai.Corpus = tm_map(Hyundai.Corpus,removeWords,mystopwords,lazy=TRUE)
Hyundai.Corpus.Copy = Hyundai.Corpus
Hyundai.Corpus = tm_map(Hyundai.Corpus,stripWhitespace,lazy = TRUE)
Hyundai.Corpus = tm_map(Hyundai.Corpus, PlainTextDocument)

Term-Document Matrix

tdm = TermDocumentMatrix(Hyundai.Corpus)
tdm

## <<TermDocumentMatrix (terms: 2560, documents: 1500)>>
## Non-/sparse entries: 15291/3824709
## Sparsity           : 100%
## Maximal term length: 26
## Weighting          : term frequency (tf)

Generate Word Cloud

m = as.matrix(tdm)
word.freq = sort(rowSums(m),decreasing = T)
wordcloud(words=names(word.freq),freq=word.freq, scale=c(8,.2),min.freq=3,max.words=Inf, random.order=FALSE, rot.per=.15, colors=brewer.pal(8, "Dark2"))

## Warning in wordcloud(words = names(word.freq), freq = word.freq, scale =
## c(8, : hyundai could not be fit on page. It will not be plotted.

From this word cloud, we can clearly see that the most important reason hyundai gained a lot of exposition and good reputation is football! Since hyundai is one of the biggest super bowl advertisers, people prefer to talk about them together with positive words. It seems the feasure of the car did not contribute to the high sentiment score. The result also stressed the importance of commercials.

Explore frequent terms and their associations

Find words that occur at least 30 times:

freqwords = findFreqTerms(tdm, lowfreq = 30)
freqwords

##  [1] "advert"          "amp"             "anyone"         
##  [4] "babajidefadoju"  "becausefootball" "behind"         
##  [7] "bonuses"         "bowl"            "brand"          
## [10] "car"             "chance"          "chxta"          
## [13] "commercial"      "cool"            "creativity"     
## [16] "debuts"          "deserve"         "dont"           
## [19] "enter"           "est"             "etisalatja"     
## [22] "exciting"        "fans"            "first"          
## [25] "flagship"        "funny"           "genesis"        
## [28] "genesisusa"      "get"             "hyundai"        
## [31] "hyundais"        "info"            "intent"         
## [34] "jan"             "live"            "look"           
## [37] "luxury"          "marketing"       "naias"          
## [40] "naiasdetroit"    "new"             "nigeria"        
## [43] "now"             "people"          "really"         
## [46] "redefined"       "reveal"          "sending"        
## [49] "super"           "superbowl"       "tco"            
## [52] "tcohcfph"        "tcoxrzvclz"      "tcoypapzdqh"    
## [55] "team"            "texans"          "tickets"        
## [58] "watch"           "whoever"         "will"           
## [61] "win"             "xmas"

The code below identifies which words are associated with “luxury”:

assocs = findAssocs(tdm,terms = 'luxury',corlimit=0.2)
assocs

## $luxury
##        naias   genesisusa        brand          est     flagship 
##         0.67         0.63         0.58         0.52         0.52 
##    redefined  tcoypapzdqh          jan       reveal        watch 
##         0.52         0.52         0.51         0.48         0.48 
##          new       debuts     tcohcfph   tcoxrzvclz         info 
##         0.46         0.45         0.45         0.45         0.43 
##         live          amp  genesisusas       todays  tcoypapzvfe 
##         0.41         0.39         0.32         0.32         0.31 
##        debut         dont         miss     exciting      genesis 
##         0.30         0.30         0.30         0.29         0.28 
##     intimate tcorfwdrtrts newlycrowned  tcosyjuenlk 
##         0.26         0.26         0.21         0.21

Remove sparse terms:

tdm2 = removeSparseTerms(tdm, sparse = 0.95)
m2 = as.matrix(tdm2)
m3 = t(m2)
set.seed(123)

K-mean clustering

# number of clusters
k = 6
kmeansOut = kmeans(m3,k)
# cluster centers
round(kmeansOut$centers,digits = 3)

##   advert anyone babajidefadoju becausefootball behind bonuses  bowl brand
## 1  0.000   1.00           0.99           0.000  0.990   0.000 0.000 0.000
## 2  0.000   0.00           0.00           0.000  0.000   0.000 0.000 0.301
## 3  0.000   0.00           0.00           0.998  0.000   0.000 0.815 0.000
## 4  0.000   0.00           0.00           0.000  0.000   0.000 0.000 0.620
## 5  1.000   0.00           0.00           0.000  0.000   0.984 0.000 0.000
## 6  0.002   0.01           0.00           0.048  0.002   0.002 0.004 0.006
##   chance chxta  cool creativity deserve enter etisalatja  fans funny
## 1  0.000 0.000 0.990       0.99   0.000 0.000          1 0.000 0.990
## 2  0.000 0.000 0.000       0.00   0.000 0.000          0 0.000 0.000
## 3  1.000 0.000 0.000       0.00   0.000 1.002          0 0.172 0.000
## 4  0.000 0.000 0.000       0.00   0.000 0.000          0 0.000 0.000
## 5  0.000 0.984 0.000       0.00   0.984 0.000          0 0.000 0.000
## 6  0.021 0.008 0.025       0.00   0.002 0.004          0 0.004 0.002
##   genesis genesisusa   get hyundai intent  live luxury marketing naias
## 1   0.000      0.000 0.000   1.000      1 0.000  0.000         0 0.000
## 2   0.098      1.098 0.000   0.988      0 0.520  0.803         0 0.780
## 3   0.000      0.000 0.000   1.002      0 0.000  0.000         0 0.000
## 4   0.832      0.000 0.058   0.905      0 0.029  0.591         0 0.869
## 5   0.000      0.000 0.984   1.000      0 0.000  0.000         1 0.000
## 6   0.054      0.023 0.033   1.029      0 0.008  0.004         0 0.046
##     new nigeria   now people really sending super superbowl   tco team
## 1 0.000   0.000 0.000  0.000  0.000    0.00 0.000     0.000 0.000    0
## 2 0.526   0.000 0.000  0.000  0.000    0.00 0.000     0.000 0.006    0
## 3 0.000   0.000 0.000  0.808  0.000    0.98 0.815     0.208 0.020    0
## 4 0.307   0.000 0.000  0.000  0.007    0.00 0.000     0.000 0.182    0
## 5 0.000   0.992 0.984  0.000  1.000    0.00 0.000     0.000 0.961    1
## 6 0.034   0.000 0.027  0.008  0.000    0.00 0.002     0.013 0.011    0
##   tickets watch whoever   win  xmas
## 1   0.000 0.000       1 0.000 0.000
## 2   0.000 0.387       0 0.000 0.000
## 3   0.176 0.016       0 1.005 0.000
## 4   0.000 0.007       0 0.000 0.000
## 5   0.000 0.000       0 0.000 0.984
## 6   0.006 0.004       0 0.031 0.000

#print clusters
for(i in 1:k){
        cat(paste("cluster", i, ": ", sep = ""))
        s = sort(kmeansOut$centers[i, ], decreasing = T)
        cat(names(s)[1:5],"\n")
}

## cluster1: anyone etisalatja hyundai intent whoever 
## cluster2: genesisusa hyundai luxury naias new 
## cluster3: win enter hyundai chance becausefootball 
## cluster4: hyundai naias genesis brand luxury 
## cluster5: advert hyundai marketing really team 
## cluster6: hyundai genesis becausefootball naias new

K-medoids clustering

Partitioning (clustering) of the data into k clusters “around medoids”, it is more robust than kmeans.

pamOut = pamk(m3)
k2 = pamOut$nc
k2

## [1] 10

pamOut = pamOut$pamobject

# print cluster medoids
for(i in 1:k2){
        cat("cluster",i,": ",colnames(pamOut$medoids)[which(pamOut$medoids[i,]==1)],"\n")
}

## cluster 1 :  genesis hyundai 
## cluster 2 :  hyundai 
## cluster 3 :  becausefootball chance enter fans hyundai sending superbowl tickets win 
## cluster 4 :  brand hyundai luxury naias tco 
## cluster 5 :  brand genesisusa hyundai luxury naias new 
## cluster 6 :  genesisusa hyundai live luxury naias watch 
## cluster 7 :  genesisusa hyundai live new 
## cluster 8 :  becausefootball bowl chance enter hyundai people sending super win 
## cluster 9 :  anyone babajidefadoju behind cool creativity etisalatja funny hyundai intent whoever 
## cluster 10 :  advert bonuses chxta deserve get hyundai marketing nigeria now really tco team xmas

Visualize clustering result:

clusplot(pamOut, col.p = pamOut$clustering)