# Open Source & Knowledge Sharing
#
# Names of every package this analysis attaches (Twitter access, text mining,
# data wrangling and plotting).
packages_to_use <- c(
  "twitteR", "base64enc", "dplyr", "tm", "stringi", "tidyr",
  "ggplot2", "wordcloud2", "stats", "ape", "purrr", "lubridate",
  "scales", "tidytext", "stringr", "plotly"
)
#' Install any missing packages, then attach all of them.
#'
#' @param packages Character vector of package names.
#' @return Invisibly, the list produced by calling `library()` on each element
#'   of `packages` (an empty list when `packages` is empty).
install_load <- function(packages) {
  # system.file() returns "" for a package that is not installed; this avoids
  # scanning the whole library with installed.packages().
  is_installed <- vapply(
    packages,
    function(pkg) nzchar(system.file(package = pkg)),
    logical(1)
  )
  to_install <- unique(packages[!is_installed])
  if (length(to_install) > 0) {
    install.packages(
      to_install,
      repos = "https://cran.us.r-project.org",
      dependencies = TRUE
    )
  }
  # invisible() keeps the list of attached packages from being printed when
  # the function is called at top level.
  invisible(lapply(packages, library, character.only = TRUE))
}
# Install (when needed) and attach every package named in packages_to_use.
install_load(packages_to_use)
# The helper and the package list are not needed again; drop both at once.
rm(packages_to_use, install_load)
# ---- Twitter API authentication ----
# Let httr cache the OAuth token instead of asking interactively.
options(httr_oauth_cache = TRUE)

# Credentials are read from environment variables (e.g. set in ~/.Renviron)
# so that no secret is written into the script.
api_key <- Sys.getenv("TWITTER_API_KEY")
api_secret <- Sys.getenv("TWITTER_API_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET")

# Sys.getenv() returns "" for an unset variable; fail early with a clear
# message rather than sending empty credentials to the API.
if (!all(nzchar(c(api_key, api_secret, access_token, access_token_secret)))) {
  stop(
    "Set TWITTER_API_KEY, TWITTER_API_SECRET, TWITTER_ACCESS_TOKEN and ",
    "TWITTER_ACCESS_TOKEN_SECRET before running this script.",
    call. = FALSE
  )
}

setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)
# Please search for "Twitter API" on Google to learn how to set up this part.
# Recommended reference: here (the hyperlink was lost when this page was converted to text)
# Request up to 2000 English tweets matching "olive garden" and flatten the
# returned list straight into a data.frame.
twitter_dataframe <- twListToDF(
  searchTwitter("olive garden", n = 2000, lang = "en", locale = "UTF-8")
)
# Keep a copy on disk, then read it back.
save(list = "twitter_dataframe", file = "twitter_dataframe.RData")
load(file = "twitter_dataframe.RData")
# Columns available for each tweet.
colnames(twitter_dataframe)
## [1] "text" "favorited" "favoriteCount" "replyToSN"
## [5] "created" "truncated" "replyToSID" "id"
## [9] "replyToUID" "statusSource" "screenName" "retweetCount"
## [13] "isRetweet" "retweeted" "longitude" "latitude"
# First two tweet texts.
head(twitter_dataframe[["text"]], n = 2)
## [1] "I dead ass just chased down a guest at olive garden bc they didn't pay LMAO"
## [2] "Olive Garden is the rich mans Applebees"
# How many tweets are marked as each encoding.
table(stri_enc_mark(twitter_dataframe[["text"]]))
##
## ASCII UTF-8
## 1402 598
# The following is my own solution to the encoding issue. Feel free to search Google for other solutions.
# ---- Clean the tweet text and build the term-document matrix ----
Sample <- twitter_dataframe$text
# Convert to ASCII; characters that cannot be represented become "?" (removed
# later together with the rest of the punctuation).
Sample <- iconv(Sample, "UTF-8", "ASCII", sub = "?")

# VCorpus keeps every document as a PlainTextDocument, so no re-wrapping step
# is needed before TermDocumentMatrix().
textSample <- VCorpus(VectorSource(Sample))

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Lower-case once, wrapped in content_transformer() so the documents keep
# their class.
textSample <- tm_map(textSample, content_transformer(tolower))
# Strip URLs: match up to the next whitespace only, so the words that follow
# a link are kept.
textSample <- tm_map(textSample, toSpace, "(f|ht)tps?://\\S+")
# Strip @mentions, including handles that contain digits or underscores.
textSample <- tm_map(textSample, toSpace, "@\\w+")
#textSample <- tm_map(textSample, toSpace, "[^\\p{L}\\s[']]+")
textSample <- tm_map(textSample, removeWords, stopwords("en"))
textSample <- tm_map(textSample, removePunctuation)
textSample <- tm_map(textSample, removeNumbers)
textSample <- tm_map(textSample, stripWhitespace)

# Rows are terms, columns are tweets; the corpus itself is no longer needed.
tdm <- TermDocumentMatrix(textSample)
rm(textSample)

# Peek at the first five tweets, keeping only terms present in most of them.
inspect(removeSparseTerms(tdm[, 1:5], 0.7))
# Output below is from the author's original run; document labels and counts
# will vary with the tweets retrieved.
## <<TermDocumentMatrix (terms: 2, documents: 5)>>
## Non-/sparse entries: 10/0
## Sparsity : 0%
## Maximal term length: 6
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0) character(0) character(0) character(0) character(0)
## garden 1 1 1 1 1
## olive 1 1 1 1 1
# Total occurrences of each term across all tweets, most frequent first.
tm_freq <- tdm %>%
  as.matrix() %>%
  rowSums() %>%
  sort(decreasing = TRUE)
# The twenty most frequent terms.
head(tm_freq, n = 20)
## olive garden want like breadsticks just
## 2002 1992 188 180 175 133
## family get three years ass know
## 110 87 87 83 71 71
## amp eat salad day going good
## 68 68 66 64 64 64
## grammys today
## 63 61
# ---- Most frequent words: bar chart and word cloud ----
# The search terms themselves occur in nearly every tweet, so leave them out.
to_remove <- c("olive", "garden")
tm_freq_chosen <- tm_freq[!names(tm_freq) %in% to_remove]

# One row per word; tm_freq is already sorted, so rows are in descending
# order of count.
freq_word <- data.frame(
  word = names(tm_freq_chosen),
  count = as.numeric(tm_freq_chosen),
  stringsAsFactors = FALSE
)
top_freq_words <- head(freq_word, 50)

# Build the static bar chart first, then hand it to plotly for interactivity.
freq_plot <- ggplot(
  top_freq_words,
  aes(x = reorder(word, count), y = count, fill = count)
) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(x = "")
ggplotly(freq_plot)

wordcloud2(top_freq_words, size = 0.6, color = "random-light")
# ---- Hierarchical clustering of the words ----
# Keep only the terms that survive the 0.98 sparsity threshold.
dense_tdm <- removeSparseTerms(x = tdm, sparse = 0.98)
tm_matrix <- as.matrix(x = dense_tdm)
# scale() normalises the matrix; dist() then measures the Manhattan distance
# between words (the rows).
tm_distMatrix <- dist(scale(tm_matrix), method = "manhattan")
tm_cluster <- hclust(tm_distMatrix, method = "ward.D2")
# Passing hang = -1 to plot() would put all labels at the same level.
plot(
  tm_cluster,
  main = "Word Cluster Dendrogram",
  cex = 0.9
)
# Outline the five clusters on the dendrogram.
rect.hclust(tree = tm_cluster, k = 5)
# Fan-shaped tree; tip colours, edge colours and edge widths are drawn at
# random each time this runs.
plot(
  as.phylo(tm_cluster),
  type = "fan",
  use.edge.length = TRUE,
  col = "gray80",
  tip.color = hsv(runif(15, 0.65, 0.95), 1, 1, 0.7),
  edge.color = hsv(runif(10, 0.65, 0.75), 1, 1, 0.7),
  edge.width = runif(20, 0.5, 3)
)
# ---- Fan plot coloured by cluster ----
# One colour per cluster.
mypal <- c("#556270", "#4ECDC4", "#1B676B", "#FF6B6B", "#C44D58")
# Cut the dendrogram into 5 clusters; clus5 holds the cluster number of
# each word.
clus5 <- cutree(tm_cluster, 5)
# Change the plot background, remembering the previous setting in `op`.
op <- par(bg = "#E8DDCB")
# Each tip label is coloured according to the cluster its word belongs to.
plot(as.phylo(tm_cluster),
  type = "fan", tip.color = mypal[clus5], label.offset = 1,
  cex = 0.9, col = "red"
)
# Put the background back so later plots are not affected.
par(op)
# ---- Coloured dendrogram with A2Rplot ----
# Load the code of the A2R plotting function.
# NOTE(review): this downloads and executes remote R code over plain HTTP.
# Review the file and prefer a vetted local copy before running it.
source("http://addictedtor.free.fr/packages/A2R/lastVersion/R/code.R")
# Change the plot background, remembering the previous setting in `op`.
op <- par(bg = "#EFEFEF")
A2Rplot(tm_cluster,
  k = 2, boxes = FALSE, col.up = "gray50",
  col.down = c("#FF6B6B", "#4ECDC4", "#556270")
)
# Put the background back so later plots are not affected.
par(op)
# ---- Sentiment summary using the NRC lexicon ----
# get_sentiments() is tidytext's accessor for a named lexicon.
# NOTE(review): recent tidytext releases fetch NRC through the textdata
# package and may ask to accept its licence on first use -- confirm locally.
nrc <- get_sentiments("nrc") %>%
  select(word, sentiment)

# Attach the sentiment(s) of each word, then add up the word counts per
# sentiment, largest first.
sentiment_analysis <- freq_word %>%
  inner_join(nrc, by = "word") %>%
  group_by(sentiment) %>%
  summarize(count = sum(count)) %>%
  ungroup() %>%
  arrange(desc(count))

# Horizontal bars, ordered by count.
ggplot(
  sentiment_analysis,
  aes(x = reorder(sentiment, count), y = count, fill = count)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  labs(x = "") +
  scale_fill_gradient(low = "lightblue", high = "darkblue")
# Thanks for viewing.
# If you have any suggestions, please contact me at: puxin.xu@gmail.com