require(tidyverse) # data manipulation
require(tidytext) # text mining and sentiment lexicons
require(rtweet) # Twitter search
require(stringr) # string manipulation
require(plotly) # plotting package
require(SnowballC) # word stemming
Search tweets for “Bernie Sanders” and “Joe Biden”, then count the number of positive and negative terms in each set.
# Load the Bing lexicon of sentiment terms
sentiment_term <- get_sentiments("bing")
head(sentiment_term)
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
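Before counting anything, it is worth tabulating the lexicon itself: the Bing lexicon contains noticeably more negative than positive entries, which should be kept in mind when reading raw counts.
# Count lexicon entries per sentiment class
table(sentiment_term$sentiment)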
# Search query
search_term <- "Bernie Sanders"
bernie <- search_tweets(search_term, n=500, include_rts = FALSE, lang="en")
search_term <- "Joe Biden"
joe <- search_tweets(search_term, n=500, include_rts = FALSE, lang="en")
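search_tweets() assumes an authenticated rtweet session is already set up. A quick look at the returned data frames confirms what came back before any processing:
# Sanity-check the search results: row counts and a sample of the raw text
nrow(bernie)
nrow(joe)
head(bernie$text, 3)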
# Tokenize the terms and Match with the sentiment terms
bernie_words <- strsplit(bernie$text,' ')
bernie_words_sent <- lapply(bernie_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% z]
})
table(unlist(bernie_words_sent))
##
## negative positive
## 308 338
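Note the direction of the match: the lookup iterates over lexicon entries, so each matching lexicon word contributes one label per tweet even if it is repeated within the tweet. A minimal sketch on a hand-made token vector (the sentence is invented for illustration) shows how it works; it should yield one "positive" label (for "wonderful") and one "negative" label (for "awful").
# Toy example: only tokens that appear in the lexicon survive the match
toy_tokens <- c("what", "a", "wonderful", "awful", "debate")
sentiment_term$sentiment[sentiment_term$word %in% toy_tokens]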
joe_words <- strsplit(joe$text,' ')
table(unlist(lapply(joe_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% z]
})))
##
## negative positive
## 309 321
Redo the analysis, but this time with all terms stemmed.
# Word Stemming
sentiment_term$word <- wordStem(sentiment_term$word,"english")
sentiment_term <- sentiment_term[!duplicated(sentiment_term$word),] # remove duplicated terms
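Stemming collapses inflected variants onto one lexicon entry; for example, the "abomin-" family from the lexicon preview above should map to a single common stem. One caveat: when two entries with opposite sentiment collapse to the same stem, the deduplication above keeps whichever label happens to come first.
# The Porter stemmer maps related word forms to a common stem
wordStem(c("abominable","abominably","abominate"), "english")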
bernie_tb <- table(unlist(sapply(bernie_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
})))
joe_tb <- table(unlist(sapply(joe_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
})))
prop.table(bernie_tb) # use prop.table to view the proportions
##
## negative positive
## 0.375802 0.624198
prop.table(joe_tb)
##
## negative positive
## 0.4024976 0.5975024
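The positive shares look fairly close. As a rough check of whether the gap is bigger than sampling noise, one option is a two-sample proportion test on the raw counts (treating each matched term as an independent draw, which is a simplification, since terms cluster within tweets):
# Compare the positive-term share between the two samples
prop.test(c(bernie_tb["positive"], joe_tb["positive"]),
c(sum(bernie_tb), sum(joe_tb)))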
# Define functions to calculate total terms, positive/negative term counts, and a "sentiment score" for a set of tweets
Find_total_terms <- function(sa_text){
sa_words <- strsplit(sa_text,' ')
tt <- sum(unlist(lapply(sa_words, length)))
return(tt)
}
Find_pos_sentiments <- function(sa_text){
sa_words <- strsplit(sa_text,' ')
s <- table(unlist(lapply(sa_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
})))
return(s["positive"]) # index by name rather than position, in case one class is absent
}
Find_neg_sentiments <- function(sa_text){
sa_words <- strsplit(sa_text,' ')
s <- table(unlist(lapply(sa_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
})))
return(s["negative"])
}
Find_sentimentscore <- function(sa_text){
sa_words <- strsplit(sa_text,' ')
s <- table(unlist(lapply(sa_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
})))
score <- (s["positive"]-s["negative"])/(s["positive"]+s["negative"])
return(unname(score))
}
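The helpers expect a character vector of cleaned tweet text. As a quick sketch, they can be tried on the Bernie Sanders tweets after the same clean-up that the loop below applies:
# Trial run on one set of tweets, cleaned the same way as in the loop below
txt <- tolower(gsub('[[:punct:]]',' ', bernie$text))
Find_total_terms(txt)
Find_pos_sentiments(txt)
Find_neg_sentiments(txt)
Find_sentimentscore(txt)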
# Different candidates
candidates <- c("Joe Biden","Bernie Sanders","Elizabeth Warren","Michael Bloomberg","Pete Buttigieg")
# Build a loop to search for each candidate and combine all results into the data.frame all_data
all_data <- c()
for(cand in candidates){ # use a descriptive loop variable instead of c, which collides with base::c()
st <- search_tweets(cand, n=1000, include_rts = FALSE, lang="en")
if (nrow(st)!=0){
st$text <- gsub('[[:punct:]]',' ',st$text) # remove all punctuation characters
st$text <- gsub('[^[:alnum:] ]',' ',st$text) # remove remaining non-alphanumeric characters
st$text <- tolower(st$text) # convert to lower case
st$candidate <- cand
all_data <- rbind(all_data,st)
}
Sys.sleep(10) # pause between queries to respect the API rate limit
}
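Since search results change every time the query runs and the API is rate-limited, it can help to cache the combined data frame so the rest of the analysis is reproducible without re-querying (the file name here is just an example):
# Optional: cache the collected tweets for offline reruns
saveRDS(all_data, "all_data.rds")
# all_data <- readRDS("all_data.rds")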
# Use aggregate to count the number of posts for each candidate
aggregate(text~candidate,all_data,length)
## candidate text
## 1 Bernie Sanders 1000
## 2 Elizabeth Warren 1000
## 3 Joe Biden 1000
## 4 Michael Bloomberg 988
## 5 Pete Buttigieg 1000
# Calculate total terms, positive/negative terms, and sentiment score for each candidate
ad_tlt <- aggregate(text~candidate,all_data,Find_total_terms)
ad_pos <- aggregate(text~candidate,all_data,Find_pos_sentiments)
ad_neg <- aggregate(text~candidate,all_data,Find_neg_sentiments)
ad_ssc <- aggregate(text~candidate,all_data,Find_sentimentscore)
# Convert the counts into percentages of total terms
ad_ssc$sent_p <- 100*(ad_pos$text + ad_neg$text) / ad_tlt$text
ad_ssc$possent_p <- 100*ad_pos$text / ad_tlt$text
ad_ssc$negsent_p <- 100*ad_neg$text / ad_tlt$text
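Before plotting, a quick ranking by sentiment score (aggregate stores the score in the text column) gives a first read of the result:
# Rank candidates from most to least positive
ad_ssc[order(-ad_ssc$text), c("candidate","text","sent_p")]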
# Plot % of Sentiment terms
p <- plot_ly(ad_ssc, x = ~candidate, y = ~sent_p, name = ~candidate, type = 'bar')
p <- layout(p, title = "% of Sentiment Terms", xaxis = list(title = "Candidates"), yaxis = list (title = "Percentage"))
p
# Plot % of Negative Sentiment terms
p <- plot_ly(ad_ssc, x = ~candidate, y = ~negsent_p, name = ~candidate, type = 'bar')
p <- layout(p, title = "% of Negative Sentiment Terms", xaxis = list(title = "Candidates"), yaxis = list (title = "Percentage"))
p
# Plot % of Postive Sentiment terms
p <- plot_ly(ad_ssc, x = ~candidate, y = ~possent_p, name = ~candidate, type = 'bar')
p <- layout(p, title = "% of Positive Sentiment Terms", xaxis = list(title = "candidates"), yaxis = list (title = "Percentage"))
p
# Plot Sentiment Score
p <- plot_ly(ad_ssc, x = ~candidate, y = ~text, name = ~candidate, type = 'bar') # aggregate stored the score in the text column
p <- layout(p, title = "Sentiment Score", xaxis = list(title = "Candidates"), yaxis = list (title = "Score"))
p
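As an alternative to separate charts, the positive and negative percentages can be shown side by side in one grouped bar chart; this is a sketch using plotly's add_trace with barmode = 'group':
# Positive vs. negative term percentages in a single grouped chart
p <- plot_ly(ad_ssc, x = ~candidate, y = ~possent_p, name = 'positive', type = 'bar')
p <- add_trace(p, y = ~negsent_p, name = 'negative')
p <- layout(p, barmode = 'group', title = "% of Positive vs Negative Terms", xaxis = list(title = "Candidates"), yaxis = list(title = "Percentage"))
p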
Lastly, we try the NRC lexicon to measure the share of anger-related terms in posts about each candidate.
sentiment_term <- get_sentiments("nrc")
# Use command prop.table to find the %
Find_NRC_sentiments <- function(sa_text){
sa_words <- strsplit(sa_text,' ')
nrc_sentiment <- 100*prop.table(table(unlist(lapply(sa_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
}))))
return(nrc_sentiment)
}
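Find_NRC_sentiments returns a named percentage table over all matched NRC categories (anger, anticipation, disgust, fear, joy, negative, positive, sadness, surprise, trust). A sketch on a single invented sentence shows the shape of the output:
# The return value is a named table of category percentages
Find_NRC_sentiments("the crowd was angry and fearful but also hopeful")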
ad_anger <- aggregate(text~candidate,all_data,Find_NRC_sentiments)
p <- plot_ly(ad_anger, x = ~candidate, y = ~text[,1], name = ~candidate, type = 'bar') # column 1 of the table is "anger" (NRC categories sort alphabetically)
p <- layout(p, title = "NRC's Anger Post %", xaxis = list(title = "Candidates"), yaxis = list (title = "%"))
p