Sample: olive garden

Data Preparation

Data Crawling

crawl data

# Search Twitter for English tweets matching "olive garden" (n = 2000).
tweets <- searchTwitter(
  "olive garden",
  n = 2000,
  lang = "en",
  locale = "UTF-8"
)
# Convert the search result to a data.frame, then discard the raw result.
twitter_dataframe <- twListToDF(tweets)
rm(tweets)
# Store the data frame on disk.
save(twitter_dataframe, file = "twitter_dataframe.RData")

# Read the stored data frame back in.
load("twitter_dataframe.RData")

basic view

# List the columns available in the tweet data frame.
colnames(twitter_dataframe)

##  [1] "text"          "favorited"     "favoriteCount" "replyToSN"    
##  [5] "created"       "truncated"     "replyToSID"    "id"           
##  [9] "replyToUID"    "statusSource"  "screenName"    "retweetCount" 
## [13] "isRetweet"     "retweeted"     "longitude"     "latitude"

Text Cleansing

Have a glance

# Show the first two tweet texts.
head(twitter_dataframe[["text"]], n = 2)

## [1] "I dead ass just chased down a guest at olive garden bc they didn't pay LMAO"
## [2] "Olive Garden is the rich mans Applebees"

Check encoding

# Count how many tweet texts are marked with each encoding.
table(
  stri_enc_mark(twitter_dataframe[["text"]])
)

## 
## ASCII UTF-8 
##  1402   598

Extract text content

The following is my own workaround for the encoding issue. Feel free to search for other solutions on Google.

# Convert the tweet text from UTF-8 to ASCII; anything that cannot be
# converted is replaced by "?".
Sample <- iconv(
  twitter_dataframe[["text"]],
  from = "UTF-8",
  to = "ASCII",
  sub = "?"
)
# Build a tm corpus from the converted character vector.
textSample <- Corpus(VectorSource(Sample))

Clean text & Transform to TermDocumentMatrix

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Re-encode every document; with sub = "byte", bytes that cannot be converted
# are written out as <xx> hex escapes.
# The former `mc.cores = 1` argument is dropped: tm_map() forwards extra
# arguments to the transformer, which does not accept it.
textSample <- tm_map(
  textSample,
  content_transformer(function(x) iconv(enc2utf8(x), sub = "byte"))
)
# Lower-case once, wrapped in content_transformer() so the documents keep
# their class and ids. A bare tm_map(x, tolower) turns them into plain
# character vectors and then needs a PlainTextDocument repair step, which
# wipes the document ids (they show up as "character(0)").
textSample <- tm_map(textSample, content_transformer(tolower))
# Remove a URL up to the next whitespace. The previous greedy "(.*)[.][a-z]+"
# pattern could swallow ordinary text following the URL and left the URL
# path behind as junk tokens.
textSample <- tm_map(textSample, toSpace, "(f|ht)tps?://\\S+")
# Remove @mentions, including handles that contain digits or underscores.
textSample <- tm_map(textSample, toSpace, "@\\w+")
#textSample <- tm_map(textSample, toSpace, "[^\\p{L}\\s[']]+")
# Stop words go first: the stop word list contains apostrophes ("didn't"),
# so punctuation must still be present when they are matched.
textSample <- tm_map(textSample, removeWords, stopwords("en"))
textSample <- tm_map(textSample, removePunctuation)
textSample <- tm_map(textSample, removeNumbers)
textSample <- tm_map(textSample, stripWhitespace)
# Terms in rows, documents (tweets) in columns.
tdm <- TermDocumentMatrix(textSample)
rm(textSample)
# Peek at the first five documents, keeping only terms present in most of them.
inspect(removeSparseTerms(tdm[, 1:5], 0.7))

## <<TermDocumentMatrix (terms: 2, documents: 5)>>
## Non-/sparse entries: 10/0
## Sparsity           : 0%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## 
##         Docs
## Terms    character(0) character(0) character(0) character(0) character(0)
##   garden            1            1            1            1            1
##   olive             1            1            1            1            1

Data Analysis

Analyze high-frequency words

Calculate frequency of words

# Sum each term's counts over all documents, then order from most to least
# frequent.
tm_freq <- rowSums(as.matrix(tdm))
tm_freq <- sort(tm_freq, decreasing = TRUE)
# Show the 20 most frequent terms.
head(tm_freq, n = 20)

##       olive      garden        want        like breadsticks        just 
##        2002        1992         188         180         175         133 
##      family         get       three       years         ass        know 
##         110          87          87          83          71          71 
##         amp         eat       salad         day       going        good 
##          68          68          66          64          64          64 
##     grammys       today 
##          63          61

Plot frequency (focusing on top 50)

# Exclude the search terms themselves from the frequency table.
to_remove <- c("olive", "garden")
tm_freq_chosen <- tm_freq[!names(tm_freq) %in% to_remove]
# One row per term. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
freq_word <- data.frame(
  word = names(tm_freq_chosen),
  count = as.numeric(tm_freq_chosen),
  stringsAsFactors = FALSE
)
# Keep the first 50 rows of the frequency table.
top_freq_words <- head(freq_word, 50)
# Interactive bar chart: bars ordered by count, fill colour scaled by count.
ggplotly(
  ggplot(top_freq_words, aes(x = reorder(word, count), y = count, fill = count)) +
    geom_bar(stat = "identity") +
    scale_fill_gradient(low = "lightblue", high = "darkblue") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
    labs(x = "")
)

Plot wordCloud

# Word cloud of the top words, drawn from the word/count data frame.
wordcloud2(
  top_freq_words,
  size = 0.6,
  color = "random-light"
)

Text clustering

hclustering

# Drop the sparsest terms (sparsity threshold 0.98) so only reasonably
# common terms are clustered.
dense_tdm <- removeSparseTerms(tdm, sparse = 0.98)
tm_matrix <- as.matrix(dense_tdm)
# scale() standardises the columns of the matrix; dist() then computes the
# Manhattan distance between its rows, i.e. between words.
tm_distMatrix <- dist(x = scale(tm_matrix), method = "manhattan")
# Hierarchical clustering of the words with Ward's method.
tm_cluster <- hclust(d = tm_distMatrix, method = "ward.D2")

plot 1

# (hang = -1) in plot, labels at the same level
# Tip: passing hang = -1 to plot() puts all labels at the same level.
plot(
  tm_cluster,
  cex = 0.9,
  main = "Word Cluster Dendrogram"
)
# Outline the clusters obtained by cutting the tree into five groups.
rect.hclust(tm_cluster, k = 5)

plot 2

# fan, add colors randomly
# Fan-shaped tree with randomly drawn tip colours, edge colours and edge
# widths (no seed is set, so the look changes on every run).
plot(
  as.phylo(tm_cluster),
  type = "fan",
  tip.color = hsv(runif(15, min = 0.65, max = 0.95), s = 1, v = 1, alpha = 0.7),
  edge.color = hsv(runif(10, min = 0.65, max = 0.75), s = 1, v = 1, alpha = 0.7),
  edge.width = runif(20, min = 0.5, max = 3),
  use.edge.length = TRUE,
  col = "gray80"
)

plot 3

# vector of colors
# Palette with one colour per cluster.
mypal <- c("#556270", "#4ECDC4", "#1B676B", "#FF6B6B", "#C44D58")
# Cut the dendrogram into 5 clusters; the cluster index picks the tip colour.
clus5 <- cutree(tm_cluster, 5)
# Set the background colour for this plot and remember the previous settings.
op <- par(bg = "#E8DDCB")
# Fan-shaped tree with each word coloured by its cluster.
plot(as.phylo(tm_cluster), type = "fan", tip.color = mypal[clus5], label.offset = 1,
    cex = 0.9, col = "red")
# Restore the previous graphical parameters so later plots are not affected.
par(op)

plot 4

# load code of A2R function
# Load the code of the A2R plotting function.
# NOTE(review): this executes remote code fetched over plain HTTP on every
# run; consider reviewing it once and sourcing a local copy instead.
source("http://addictedtor.free.fr/packages/A2R/lastVersion/R/code.R")
# Set the background colour for this plot and remember the previous settings.
op <- par(bg = "#EFEFEF")
# Coloured dendrogram split into k = 2 groups.
A2Rplot(tm_cluster, k = 2, boxes = FALSE, col.up = "gray50", col.down = c("#FF6B6B",
    "#4ECDC4", "#556270"))
# Restore the previous graphical parameters so later plots are not affected.
par(op)

For more word cluster plots, click here

Sentiment Analysis

# Word -> sentiment pairs from the NRC lexicon. get_sentiments() is used
# instead of filtering `sentiments` on a `lexicon` column, because that column
# is not available in current tidytext releases.
# NOTE(review): recent tidytext versions may ask to download the lexicon
# (via the textdata package) on first use.
nrc <- get_sentiments("nrc") %>% select(word, sentiment)
# Match every counted word against the lexicon and total the word counts per
# sentiment. A word listed under several sentiments contributes to each one.
sentiment_analysis <- freq_word %>%
  inner_join(nrc, by = "word") %>%
  group_by(sentiment) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count))
# Horizontal bar chart: one bar per sentiment, ordered and coloured by count.
ggplot(sentiment_analysis, aes(reorder(sentiment, count), count, fill = count)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  labs(x = "") +
  scale_fill_gradient(low = "lightblue", high = "darkblue")

Result

Olive garden is great! I like it!

Thanks for viewing
If you have any suggestions, please contact me at: puxin.xu@gmail.com

(OS&KS) Sample of Twitter Data Analysis

Jacob (Puxin)

February 12, 2017

Sample: olive garden

Data Preparation

Data Crawling

Text Cleansing

Data Analysis

Analyze high-frequency words

Plot frequency (focusing on top 50)

Plot wordCloud

Text clustering

Sentiment Analysis

Result

Olive garden is great! I like it!

(OS&KS) Sample of Twitter Data Analysis

Jacob (Puxin)

February 12, 2017

Open Source & Knowledge Sharing

Environment Preparation

Sample: olive garden

Data Preparation

Data Crawling

Text Cleansing

Data Analysis

Analyze high-frequency words

Plot frequency (focusing on top 50)

Plot wordCloud

Text clustering

Sentiment Analysis

Result

Olive garden is great! I like it!

`Open Source & Knowledge Sharing`