require(tidyverse) # sentiment analysis
require(tidytext) # text manipulation
require(rtweet) # Twitter search
require(stringr) # String manipulation
require(plotly) # plotting package
require(SnowballC) # Word Stemming
# Search tweets for “#Sinovac”, “#BioNTech”, and “#Moderna”, and then count the number of positive and negative terms.
# Sentiment lexicon (Bing): one row per word with its sentiment label
sentiment_term <- get_sentiments(lexicon = "bing")
head(sentiment_term)
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
# Search queries: request up to 500 English tweets per hashtag, retweets excluded
search_term <- "#Sinovac"
sinovac <- search_tweets(q = search_term, n = 500, include_rts = FALSE, lang = "en")
search_term <- "#BioNTech"
biontech <- search_tweets(q = search_term, n = 500, include_rts = FALSE, lang = "en")
# Split one tweet into word tokens and return the sentiment label of every
# lexicon entry whose word occurs among those tokens.
label_tweet_terms <- function(tweet_text) {
  tokens <- unnest_tokens(data.frame(txt = tweet_text), word, txt)
  is_present <- sentiment_term$word %in% tokens$word
  sentiment_term$sentiment[is_present]
}
# Label every Sinovac tweet, then tally the labels over all tweets
sinovac_words_sent <- lapply(sinovac$text, label_tweet_terms)
table(unlist(sinovac_words_sent))
##
## negative positive
## 243 234
# Same procedure for the BioNTech tweets
biontech_words_sent <- lapply(biontech$text, label_tweet_terms)
table(unlist(biontech_words_sent))
##
## negative positive
## 148 276
# Repeat the analysis, but this time with all terms stemmed.
# Word stemming: reduce each lexicon word to its English stem, then keep only
# the first row for every stem so that no stem is listed twice
sentiment_term$word <- wordStem(sentiment_term$word, language = "english")
is_repeat <- duplicated(sentiment_term$word)
sentiment_term <- sentiment_term[!is_repeat, ]
# Tokenize and stem each text, look the stems up in the stemmed lexicon and
# tally the sentiment labels over all texts.
tally_stemmed_labels <- function(texts) {
  labels_per_tweet <- lapply(texts, function(tweet_text) {
    tokens <- unnest_tokens(data.frame(txt = tweet_text), word, txt)
    stems <- wordStem(tokens$word, "english")
    sentiment_term$sentiment[sentiment_term$word %in% stems]
  })
  table(unlist(labels_per_tweet))
}
sinovac_tb <- tally_stemmed_labels(sinovac$text)
biontech_tb <- tally_stemmed_labels(biontech$text)
# prop.table turns the counts into proportions
prop.table(sinovac_tb)
##
## negative positive
## 0.3608124 0.6391876
prop.table(biontech_tb)
##
## negative positive
## 0.2713904 0.7286096
# Define functions to calculate total, positive and negative terms and a "sentiment score" for a group of tweets (applied per candidate below)
# Count the word tokens in a set of texts.
#
# sa_text: character vector of texts (e.g. tweets).
# Returns the total number of tokens produced by unnest_tokens() over all
# texts; 0 when sa_text is empty.
Find_total_terms <- function(sa_text) {
  # vapply() always yields an integer vector, even for empty input, whereas
  # sapply() would return list() there and make sum() fail.
  tokens_per_text <- vapply(sa_text, function(z) {
    tweet_words <- unnest_tokens(data.frame(txt = z), word, txt)
    length(tweet_words$word)
  }, integer(1))
  sum(tokens_per_text)
}
# Count the positive sentiment terms found in a set of texts.
#
# sa_text: character vector of texts (e.g. tweets).
# Returns the sum, over all texts, of the number of lexicon entries labelled
# "positive" whose word appears among the text's stemmed tokens. This counts
# lexicon entries, not token occurrences, so a word repeated inside one text
# counts once for that text.
# Reads the global `sentiment_term` (columns `word` and `sentiment`).
Find_pos_sentiments <- function(sa_text) {
  positives_per_text <- vapply(sa_text, function(z) {
    tweets_words <- unnest_tokens(data.frame(txt = z), word, txt)
    tweets_words$word <- wordStem(tweets_words$word, "english")
    matched <- sentiment_term$sentiment[sentiment_term$word %in% tweets_words$word]
    # Count by label rather than by position in table(): when a text matches
    # only positive terms the table has a single cell, so the second cell
    # does not exist and those positives would be lost.
    sum(matched == "positive")
  }, integer(1))
  sum(positives_per_text, na.rm = TRUE)
}
# Count the negative sentiment terms found in a set of texts.
#
# sa_text: character vector of texts (e.g. tweets).
# Returns the sum, over all texts, of the number of lexicon entries labelled
# "negative" whose word appears among the text's stemmed tokens. This counts
# lexicon entries, not token occurrences, so a word repeated inside one text
# counts once for that text.
# Reads the global `sentiment_term` (columns `word` and `sentiment`).
Find_neg_sentiments <- function(sa_text) {
  negatives_per_text <- vapply(sa_text, function(z) {
    tweets_words <- unnest_tokens(data.frame(txt = z), word, txt)
    tweets_words$word <- wordStem(tweets_words$word, "english")
    matched <- sentiment_term$sentiment[sentiment_term$word %in% tweets_words$word]
    # Count by label rather than by position in table(): when a text matches
    # only positive terms, the first table cell holds the positive count and
    # would be reported as negative.
    sum(matched == "negative")
  }, integer(1))
  sum(negatives_per_text, na.rm = TRUE)
}
# Net sentiment of a set of texts.
#
# sa_text: character vector of texts (e.g. tweets).
# Returns (positive - negative) / (positive + negative), using the counts from
# Find_pos_sentiments() and Find_neg_sentiments().
Find_sentimentscore <- function(sa_text) {
  positives <- Find_pos_sentiments(sa_text)
  negatives <- Find_neg_sentiments(sa_text)
  net <- positives - negatives
  net / (positives + negatives)
}
# Hashtags to compare
candidates <- c("#vaccine","#Sinovac", "#BioNTech", "#Moderna","#AstraZeneca","#Sputnik","#Sinopharm")
# Search one hashtag and return its tweets with cleaned text and a `candidate`
# column, or NULL when the search returns no rows.
fetch_candidate <- function(hashtag) {
  st <- search_tweets(hashtag, n = 2000, include_rts = FALSE, lang = "en")
  # Pause after every query (presumably to stay under the API rate limit)
  Sys.sleep(10)
  if (nrow(st) == 0) {
    return(NULL)
  }
  st$text <- gsub('[[:punct:]]', ' ', st$text) # removing all punctuation characters
  st$text <- gsub('[^[:alnum:] ]', ' ', st$text) # removing non alphanumeric characters
  st$text <- tolower(st$text) # in lower case
  st$candidate <- hashtag
  st
}
# Search each hashtag and combine the results into the data.frame all_data.
# The per-hashtag results are collected first and bound once, instead of
# growing all_data with rbind() on every iteration; the hashtag is held in a
# function argument rather than a loop variable named `c`, which would mask
# base::c for the rest of the script.
fetched <- lapply(candidates, fetch_candidate)
all_data <- do.call(rbind, Filter(Negate(is.null), fetched))
# Number of posts collected for each hashtag
aggregate(text ~ candidate, data = all_data, FUN = length)
## candidate text
## 1 #AstraZeneca 1994
## 2 #BioNTech 299
## 3 #Moderna 1994
## 4 #Sinopharm 520
## 5 #Sinovac 414
## 6 #Sputnik 286
## 7 #vaccine 1988
# Per hashtag: total terms, positive / negative term counts and sentiment score
ad_tlt <- aggregate(text ~ candidate, data = all_data, FUN = Find_total_terms)
ad_pos <- aggregate(text ~ candidate, data = all_data, FUN = Find_pos_sentiments)
ad_neg <- aggregate(text ~ candidate, data = all_data, FUN = Find_neg_sentiments)
ad_ssc <- aggregate(text ~ candidate, data = all_data, FUN = Find_sentimentscore)
# Express the term counts as percentages of all terms; the four tables come
# from the same grouping, so their rows line up
ad_ssc[["sent_p"]] <- 100 * (ad_pos[["text"]] + ad_neg[["text"]]) / ad_tlt[["text"]]
ad_ssc[["possent_p"]] <- 100 * ad_pos[["text"]] / ad_tlt[["text"]]
ad_ssc[["negsent_p"]] <- 100 * ad_neg[["text"]] / ad_tlt[["text"]]
# Bar chart: sentiment terms as a percentage of all terms
p <- plot_ly(ad_ssc, x = ~candidate, y = ~sent_p, name = ~candidate, type = "bar") %>%
  layout(title = "% of Sentiment Terms",
         xaxis = list(title = "Vaccines"),
         yaxis = list(title = "Percentage"))
p
# Bar chart: negative sentiment terms as a percentage of all terms
p <- plot_ly(ad_ssc, x = ~candidate, y = ~negsent_p, name = ~candidate, type = "bar") %>%
  layout(title = "% of Negative Sentiment Terms",
         xaxis = list(title = "Vaccines"),
         yaxis = list(title = "Percentage"))
p
# Bar chart: positive sentiment terms as a percentage of all terms
p <- plot_ly(ad_ssc, x = ~candidate, y = ~possent_p, name = ~candidate, type = "bar") %>%
  layout(title = "% of Positive Sentiment Terms",
         xaxis = list(title = "Vaccines"),
         yaxis = list(title = "Percentage"))
p
# Bar chart: sentiment score (stored in the `text` column by aggregate)
p <- plot_ly(ad_ssc, x = ~candidate, y = ~text, name = ~candidate, type = "bar") %>%
  layout(title = "Sentiment Score",
         xaxis = list(title = "Vaccines"),
         yaxis = list(title = "Score"))
p
# Last, we try using NRC’s sentiment lexicon to measure fear in the posts.
# Replace the Bing lexicon with the NRC lexicon
sentiment_term <- get_sentiments("nrc")
# Find_NRC_sentiments() stems every tweet token before matching, so the
# lexicon words must be stemmed too; otherwise only words that are already
# their own stem can match.
sentiment_term$word <- wordStem(sentiment_term$word, "english")
# Keep one row per (stem, sentiment) pair, so that several words sharing a
# stem are not counted more than once for the same sentiment.
sentiment_term <- unique(sentiment_term[, c("word", "sentiment")])
# Percentage of fear terms in a set of texts.
#
# sa_text: character vector of texts (e.g. tweets).
# Returns 100 * (number of "fear" lexicon matches summed over all texts)
# divided by the total token count from Find_total_terms().
# Reads the global `sentiment_term` (columns `word` and `sentiment`).
Find_NRC_sentiments <- function(sa_text) {
  fear_per_tweet <- lapply(sa_text, function(tweet_text) {
    tokens <- unnest_tokens(data.frame(txt = tweet_text), word, txt)
    stems <- wordStem(tokens$word, "english")
    labels <- sentiment_term$sentiment[sentiment_term$word %in% stems]
    sum(labels == "fear")
  })
  fear_total <- sum(unlist(fear_per_tweet), na.rm = TRUE)
  100 * fear_total / Find_total_terms(sa_text)
}
# Fear-term percentage for each hashtag, shown as a bar chart
ad_fear <- aggregate(text ~ candidate, data = all_data, FUN = Find_NRC_sentiments)
p <- plot_ly(ad_fear, x = ~candidate, y = ~text, name = ~candidate, type = "bar") %>%
  layout(title = "NRC's Fear Post %",
         xaxis = list(title = "Vaccines"),
         yaxis = list(title = "%"))
p