Open Source & Knowledge Sharing


Environment Preparation

  • load required packages
# Every package this walkthrough relies on: Twitter access, text mining,
# clustering, sentiment lookup and plotting.
packages_to_use <- c(
  "twitteR", "base64enc", "dplyr", "tm",
  "stringi", "tidyr", "ggplot2", "wordcloud2",
  "stats", "ape", "purrr", "lubridate",
  "scales", "tidytext", "stringr", "plotly"
)
#' Install any packages that are missing, then attach all of them.
#'
#' @param packages Character vector of package names.
#' @return Invisibly, the list produced by calling `library()` on each name
#'   (returned invisibly so a top-level call does not print it).
install_load <- function(packages) {
  to_install <- setdiff(packages, rownames(installed.packages()))
  if (length(to_install) > 0) {
    # Fetch over HTTPS rather than plain HTTP.
    install.packages(
      to_install,
      repos = "https://cran.us.r-project.org",
      dependencies = TRUE
    )
  }
  invisible(lapply(packages, library, character.only = TRUE))
}
# Attach everything, then drop the two helper objects from the workspace.
install_load(packages_to_use)
rm(packages_to_use, install_load)
  • set twitter app info
# Set the httr OAuth cache option up front so authorisation does not stop
# to ask about it.
options(httr_oauth_cache = TRUE)
# Read the app credentials from environment variables (for example set in
# ~/.Renviron) so that secrets never have to be written into this script.
api_key <- Sys.getenv("TWITTER_API_KEY")
api_secret <- Sys.getenv("TWITTER_API_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET")
# Fail early with a clear message instead of a confusing API error.
if (!all(nzchar(c(api_key, api_secret, access_token, access_token_secret)))) {
  stop(
    "Set TWITTER_API_KEY, TWITTER_API_SECRET, TWITTER_ACCESS_TOKEN and ",
    "TWITTER_ACCESS_TOKEN_SECRET before running this script.",
    call. = FALSE
  )
}
setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)

Please search for “twitter api” on Google to learn how to set up this part.

recommended reference: here


Sample: olive garden

Data Preparation

Data Crawling

  • crawl data
# Search for up to 2000 English tweets that mention the sample brand.
raw_tweets <- searchTwitter(
  "olive garden",
  n = 2000,
  lang = "en",
  locale = "UTF-8"
)
# Flatten the list of status objects into a data.frame.
twitter_dataframe <- twListToDF(raw_tweets)
rm(raw_tweets)
# Keep a copy on disk and read it back in.
save(twitter_dataframe, file = "twitter_dataframe.RData")
load("twitter_dataframe.RData")
  • basic view
# List the columns of the tweet table.
colnames(twitter_dataframe)
##  [1] "text"          "favorited"     "favoriteCount" "replyToSN"    
##  [5] "created"       "truncated"     "replyToSID"    "id"           
##  [9] "replyToUID"    "statusSource"  "screenName"    "retweetCount" 
## [13] "isRetweet"     "retweeted"     "longitude"     "latitude"

Text Cleansing

  • Have a glance
# Peek at the first two tweet texts.
head(twitter_dataframe[["text"]], n = 2)
## [1] "I dead ass just chased down a guest at olive garden bc they didn't pay LMAO"
## [2] "Olive Garden is the rich mans Applebees"
  • Check encoding
# Count tweets by the encoding mark stringi reports for each one.
table(stri_enc_mark(twitter_dataframe[["text"]]))
## 
## ASCII UTF-8 
##  1402   598
  • Extract text content

The following is my own workaround for the encoding issue. Feel free to search Google for other solutions.

# Convert the tweet text from UTF-8 to ASCII; anything that cannot be
# represented is replaced with "?".
Sample <- iconv(
  twitter_dataframe$text,
  from = "UTF-8",
  to = "ASCII",
  sub = "?"
)
# Wrap the character vector in a tm corpus, one document per tweet.
textSample <- Corpus(VectorSource(Sample))
  • Clean text & Transform to TermDocumentMatrix
# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# Re-encode the text, writing unconvertible bytes as hex escapes. No extra
# arguments are passed here because tm_map() forwards them to the transformer,
# which only accepts `x`.
textSample <- tm_map(
  textSample,
  content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
)
# Lower-case once, through content_transformer() so the documents keep their
# corpus structure.
textSample <- tm_map(textSample, content_transformer(tolower))
# Drop each URL up to the next whitespace. A greedy ".*" would also swallow
# ordinary text between the URL and a later "." in the same tweet.
textSample <- tm_map(textSample, toSpace, "(f|ht)tps?://\\S+")
# Drop @handles, which may contain digits and underscores as well as letters.
textSample <- tm_map(textSample, toSpace, "@[a-z0-9_]+")
#textSample <- tm_map(textSample, toSpace, "[^\\p{L}\\s[']]+")
# Remove stop words before punctuation so contractions such as "didn't" still
# match the stop-word list.
textSample <- tm_map(textSample, removeWords, stopwords("en"))
textSample <- tm_map(textSample, removePunctuation)
textSample <- tm_map(textSample, removeNumbers)
textSample <- tm_map(textSample, stripWhitespace)
# No PlainTextDocument() conversion: every step above preserves the corpus,
# and that conversion would discard the document ids.
tdm <- TermDocumentMatrix(textSample)
rm(textSample)
# Show the terms that occur in most of the first five documents.
inspect(removeSparseTerms(tdm[, 1:5], 0.7))
## <<TermDocumentMatrix (terms: 2, documents: 5)>>
## Non-/sparse entries: 10/0
## Sparsity           : 0%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## 
##         Docs
## Terms    character(0) character(0) character(0) character(0) character(0)
##   garden            1            1            1            1            1
##   olive             1            1            1            1            1

Data Analysis

Analyze high-frequency words

  • Calculate frequency of words
# Total occurrences of each term across all tweets, most frequent first.
tm_freq <- tdm %>%
  as.matrix() %>%
  rowSums() %>%
  sort(decreasing = TRUE)
head(tm_freq, n = 20)
##       olive      garden        want        like breadsticks        just 
##        2002        1992         188         180         175         133 
##      family         get       three       years         ass        know 
##         110          87          87          83          71          71 
##         amp         eat       salad         day       going        good 
##          68          68          66          64          64          64 
##     grammys       today 
##          63          61

Plot frequency (focusing on top 50)

# The search terms themselves dominate the counts, so leave them out.
to_remove <- c("olive", "garden")
tm_freq_chosen <- tm_freq[!names(tm_freq) %in% to_remove]
freq_word <- data.frame(
  word = names(tm_freq_chosen),
  count = as.numeric(tm_freq_chosen),
  stringsAsFactors = FALSE
)
# Takes the first 50 rows; assumes tm_freq is already sorted in decreasing
# order so that these are the 50 most frequent words.
top_freq_words <- head(freq_word, 50)
# Interactive bar chart, bars ordered by count and shaded by count.
ggplotly(
  ggplot(
    top_freq_words,
    aes(x = reorder(word, count), y = count, fill = count)
  ) +
    geom_bar(stat = "identity") +
    scale_fill_gradient(low = "lightblue", high = "darkblue") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
    labs(x = "")
)

Plot wordCloud

# Word cloud of the top words, sized by their counts.
wordcloud2(
  data = top_freq_words,
  size = 0.6,
  color = "random-light"
)

Text clustering

  • hclustering
# Drop terms whose sparsity exceeds the 0.98 threshold.
dense_tdm <- removeSparseTerms(tdm, sparse = 0.98)
tm_matrix <- as.matrix(dense_tdm)
# scale() normalises the matrix; dist() then measures the Manhattan
# distance between words (the rows).
tm_distMatrix <- dist(
  scale(tm_matrix),
  method = "manhattan"
)
# Agglomerative clustering of the words using Ward's method.
tm_cluster <- hclust(
  tm_distMatrix,
  method = "ward.D2"
)
  • plot 1
# Draw the dendrogram, then outline five clusters on it.
# (Passing hang = -1 to plot() would put all labels at the same level.)
plot(
  tm_cluster,
  main = "Word Cluster Dendrogram",
  cex = 0.9
)
rect.hclust(tm_cluster, k = 5)

  • plot 2
# Fan layout; tip colours, edge colours and edge widths are drawn at random,
# so the picture differs from run to run.
plot(
  as.phylo(tm_cluster),
  type = "fan",
  tip.color = hsv(runif(15, 0.65, 0.95), 1, 1, 0.7),
  edge.color = hsv(runif(10, 0.65, 0.75), 1, 1, 0.7),
  edge.width = runif(20, 0.5, 3),
  use.edge.length = TRUE,
  col = "gray80"
)

  • plot 3
# vector of colors, one per cluster
mypal <- c("#556270", "#4ECDC4", "#1B676B", "#FF6B6B", "#C44D58")
# cut the dendrogram into 5 clusters; each word's cluster id picks its color
clus5 <- cutree(tm_cluster, 5)
# use a tinted background for this plot only, remembering the old setting
op <- par(bg = "#E8DDCB")
# fan plot with tip labels colored by cluster
plot(
  as.phylo(tm_cluster),
  type = "fan",
  tip.color = mypal[clus5],
  label.offset = 1,
  cex = 0.9,
  col = "red"
)
# restore the previous graphics parameters so later plots are unaffected
par(op)

  • plot 4
# load code of A2R function
# NOTE(review): this downloads and executes remote code over plain HTTP on
# every run; prefer sourcing a reviewed local copy of the file.
source("http://addictedtor.free.fr/packages/A2R/lastVersion/R/code.R")
# colored dendrogram on a light background; remember the old setting
op <- par(bg = "#EFEFEF")
A2Rplot(
  tm_cluster,
  k = 2,
  boxes = FALSE,
  col.up = "gray50",
  col.down = c("#FF6B6B", "#4ECDC4", "#556270")
)
# restore the previous graphics parameters so later plots are unaffected
par(op)

  • For more word cluster plots, click here

Sentiment Analysis

# Fetch the NRC emotion lexicon through tidytext's accessor instead of
# filtering the bundled `sentiments` table on a `lexicon` column, which
# current tidytext releases do not provide.
nrc <- get_sentiments("nrc") %>%
  select(word, sentiment)
# Attach sentiments to each word, then total the word counts per sentiment.
sentiment_analysis <- freq_word %>%
  inner_join(nrc, by = "word") %>%
  group_by(sentiment) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count))
# Horizontal bar chart, one bar per sentiment, ordered and shaded by count.
ggplot(
  sentiment_analysis,
  aes(reorder(sentiment, count), count, fill = count)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  labs(x = "") +
  scale_fill_gradient(low = "lightblue", high = "darkblue")

Result

Olive garden is great! I like it!