1. Sentiment Analysis

Search tweets by hashtag (the code below queries “#Sinovac” and “#BioNTech”; “#Moderna” can be searched the same way) and then count the number of positive and negative terms in each set.

# Load the "bing" lexicon: one row per word with its sentiment label
sentiment_term <- get_sentiments(lexicon = "bing")
# Preview the first rows of the lexicon
head(sentiment_term)

## # A tibble: 6 x 2
##   word       sentiment
##   <chr>      <chr>    
## 1 2-faces    negative 
## 2 abnormal   negative 
## 3 abolish    negative 
## 4 abominable negative 
## 5 abominably negative 
## 6 abominate  negative

# Query Twitter for each hashtag: request up to 500 English tweets, retweets excluded
search_term <- "#Sinovac"
sinovac <- search_tweets(q = search_term, n = 500, include_rts = FALSE, lang = "en")

# Same query settings for the second hashtag (search_term is reused)
search_term <- "#BioNTech"
biontech <- search_tweets(q = search_term, n = 500, include_rts = FALSE, lang = "en")

# For every #Sinovac tweet: split the text into words, then collect the
# sentiment labels of the lexicon entries that occur in that tweet.
# Each lexicon entry contributes at most once per tweet.
sinovac_words_sent <- lapply(sinovac$text, function(tweet) {
  tokens <- unnest_tokens(data.frame(txt = tweet), word, txt)
  in_tweet <- sentiment_term$word %in% tokens$word
  sentiment_term$sentiment[in_tweet]
})

# Tally the labels over all tweets
table(unlist(sinovac_words_sent))

## 
## negative positive 
##      243      234

# Same matching for the #BioNTech tweets: sentiment labels of the lexicon
# entries found in each tweet (at most once per entry per tweet).
biontech_words_sent <- lapply(biontech$text, function(tweet) {
  tokens <- unnest_tokens(data.frame(txt = tweet), word, txt)
  in_tweet <- sentiment_term$word %in% tokens$word
  sentiment_term$sentiment[in_tweet]
})

# Tally the labels over all tweets
table(unlist(biontech_words_sent))

## 
## negative positive 
##      148      276

Repeat the analysis, but this time with all terms stemmed.

# Reduce every lexicon word to its English stem
sentiment_term$word <- wordStem(sentiment_term$word, language = "english")
# Several words can share a stem; keep only the first row for each stem
is_repeat <- duplicated(sentiment_term$word)
sentiment_term <- sentiment_term[!is_repeat, ]

# Sentiment labels of the stemmed lexicon entries that occur in one tweet.
# The tweet's words are stemmed first so they are comparable with the
# stemmed lexicon.
stemmed_tweet_sentiments <- function(tweet) {
  tokens <- unnest_tokens(data.frame(txt = tweet), word, txt)
  stems <- wordStem(tokens$word, "english")
  sentiment_term$sentiment[sentiment_term$word %in% stems]
}

# Tally the matched labels over all tweets of each hashtag
sinovac_tb <- table(unlist(lapply(sinovac$text, stemmed_tweet_sentiments)))

biontech_tb <- table(unlist(lapply(biontech$text, stemmed_tweet_sentiments)))

prop.table(x = sinovac_tb)  # share of each sentiment label among the matched #Sinovac terms

## 
##  negative  positive 
## 0.3608124 0.6391876

prop.table(x = biontech_tb)  # share of each sentiment label among the matched #BioNTech terms

## 
##  negative  positive 
## 0.2713904 0.7286096

# Define functions to calculate the total, positive, and negative term counts and the "sentiment score" for a set of texts
# Count the total number of word tokens across a set of texts.
#
# Args:
#   sa_text: vector of texts; each element is tokenized separately.
# Returns:
#   The summed token count (0 for an empty input).
Find_total_terms <- function(sa_text) {
  # vapply keeps the result an integer vector even for zero-length input;
  # sapply would return list() there and sum() would fail.
  terms_per_text <- vapply(sa_text, function(z) {
    tweet_words <- unnest_tokens(data.frame(txt = z), word, txt)
    length(tweet_words$word)
  }, integer(1), USE.NAMES = FALSE)
  sum(terms_per_text)
}

# Count positive sentiment terms across a set of texts.
#
# Each text is tokenized and stemmed, then compared with the global
# `sentiment_term` lexicon. A lexicon entry counts at most once per text
# (presence, not frequency).
#
# Args:
#   sa_text: vector of texts.
# Returns:
#   Total number of "positive" lexicon matches summed over all texts.
Find_pos_sentiments <- function(sa_text) {
  positives_per_text <- vapply(sa_text, function(z) {
    tweets_words <- unnest_tokens(data.frame(txt = z), word, txt)
    tweets_words$word <- wordStem(tweets_words$word, "english")
    matched <- sentiment_term$sentiment[sentiment_term$word %in% tweets_words$word]
    # Select by label, not by table position: a text whose matches are all
    # positive yields a one-cell table, so position 2 would be NA.
    sum(matched == "positive", na.rm = TRUE)
  }, integer(1), USE.NAMES = FALSE)
  sum(positives_per_text)
}

# Count negative sentiment terms across a set of texts.
#
# Each text is tokenized and stemmed, then compared with the global
# `sentiment_term` lexicon. A lexicon entry counts at most once per text
# (presence, not frequency).
#
# Args:
#   sa_text: vector of texts.
# Returns:
#   Total number of "negative" lexicon matches summed over all texts.
Find_neg_sentiments <- function(sa_text) {
  negatives_per_text <- vapply(sa_text, function(z) {
    tweets_words <- unnest_tokens(data.frame(txt = z), word, txt)
    tweets_words$word <- wordStem(tweets_words$word, "english")
    matched <- sentiment_term$sentiment[sentiment_term$word %in% tweets_words$word]
    # Select by label, not by table position: when a text has no negative
    # matches, position 1 of the table holds the positive count instead.
    sum(matched == "negative", na.rm = TRUE)
  }, integer(1), USE.NAMES = FALSE)
  sum(negatives_per_text)
}

# Net sentiment score of a set of texts:
#   (positive - negative) / (positive + negative)
# Ranges from -1 (only negative terms) to 1 (only positive terms).
# The result is NaN when no sentiment term is matched at all (0 / 0).
Find_sentimentscore <- function(sa_text) {
  positives <- Find_pos_sentiments(sa_text)
  negatives <- Find_neg_sentiments(sa_text)
  net <- positives - negatives
  matched <- positives + negatives
  net / matched
}

# Hashtags (candidates) to compare
candidates <- c("#vaccine", "#Sinovac", "#BioNTech", "#Moderna", "#AstraZeneca", "#Sputnik", "#Sinopharm")

# Search each hashtag, normalise the tweet text, and combine all results
# into the data frame all_data (with a `candidate` column naming the hashtag).
# Results are collected in a pre-sized list and bound once at the end rather
# than growing all_data with rbind() on every iteration. The loop variable is
# not called `c`, so it does not mask base::c().
results <- vector("list", length(candidates))
for (i in seq_along(candidates)) {
  candidate <- candidates[[i]]
  st <- search_tweets(candidate, n = 2000, include_rts = FALSE, lang = "en")
  if (nrow(st) != 0) {
    # Replace every character that is neither alphanumeric nor a space
    # (this covers all punctuation) with a space
    st$text <- gsub("[^[:alnum:] ]", " ", st$text)
    st$text <- tolower(st$text)  # in lower case
    st$candidate <- candidate
    results[[i]] <- st
  }
  Sys.sleep(10)  # pause between consecutive searches
}
# Drop hashtags that returned nothing; all_data stays NULL if every search was empty
all_data <- do.call(rbind, Filter(Negate(is.null), results))

# Use aggregate to count the number of posts collected for each candidate
aggregate(text ~ candidate, data = all_data, FUN = length)

##      candidate text
## 1 #AstraZeneca 1994
## 2    #BioNTech  299
## 3     #Moderna 1994
## 4   #Sinopharm  520
## 5     #Sinovac  414
## 6     #Sputnik  286
## 7     #vaccine 1988

# Calculate, per candidate: total terms, positive terms, negative terms,
# and the sentiment score
ad_tlt <- aggregate(text ~ candidate, data = all_data, FUN = Find_total_terms)
ad_pos <- aggregate(text ~ candidate, data = all_data, FUN = Find_pos_sentiments)
ad_neg <- aggregate(text ~ candidate, data = all_data, FUN = Find_neg_sentiments)
ad_ssc <- aggregate(text ~ candidate, data = all_data, FUN = Find_sentimentscore)

# Express the sentiment-term counts as percentages of all terms.
# All four aggregates group by the same column, so their rows line up.
ad_ssc[["sent_p"]] <- 100 * (ad_pos[["text"]] + ad_neg[["text"]]) / ad_tlt[["text"]]
ad_ssc[["possent_p"]] <- 100 * ad_pos[["text"]] / ad_tlt[["text"]]
ad_ssc[["negsent_p"]] <- 100 * ad_neg[["text"]] / ad_tlt[["text"]]

# Build a bar chart with one bar per candidate from ad_ssc.
#   y_column:    one-sided formula naming the ad_ssc column to plot
#   chart_title: title shown above the chart
#   y_label:     title of the y axis
bar_by_candidate <- function(y_column, chart_title, y_label) {
  fig <- plot_ly(ad_ssc, x = ~candidate, y = y_column, name = ~candidate, type = "bar")
  layout(fig, title = chart_title, xaxis = list(title = "Vaccines"), yaxis = list(title = y_label))
}

# Plot % of Sentiment terms
p <- bar_by_candidate(~sent_p, "% of Sentiment Terms", "Percentage")
p

# Plot % of Negative Sentiment terms
p <- bar_by_candidate(~negsent_p, "% of Negative Sentiment Terms", "Percentage")
p

# Plot % of Positive Sentiment terms
p <- bar_by_candidate(~possent_p, "% of Positive Sentiment Terms", "Percentage")
p

# Plot Sentiment Score (stored in the `text` column of ad_ssc)
p <- bar_by_candidate(~text, "Sentiment Score", "Score")
p

Lastly, we use the NRC sentiment lexicon to measure how much fear-related language appears in the posts.

# Replace the global lexicon with NRC; code that reads sentiment_term after this line sees NRC labels
sentiment_term <- get_sentiments(lexicon = "nrc")

# Percentage of terms in a set of texts that match a "fear" entry of the
# global `sentiment_term` lexicon.
#
# The lexicon's fear words are stemmed with the same stemmer as the tweet
# tokens. Comparing stemmed tokens with unstemmed lexicon words would miss
# every word whose stem differs from its dictionary form.
#
# Args:
#   sa_text: vector of texts.
# Returns:
#   100 * (fear matches summed over all texts) / (total number of terms).
#   A fear stem counts at most once per text.
Find_NRC_sentiments <- function(sa_text) {
  # Stem the fear vocabulary once, outside the per-text loop
  is_fear <- sentiment_term$sentiment %in% "fear"
  fear_stems <- unique(wordStem(sentiment_term$word[is_fear], "english"))

  fear_per_text <- vapply(sa_text, function(z) {
    tweets_words <- unnest_tokens(data.frame(txt = z), word, txt)
    tweet_stems <- wordStem(tweets_words$word, "english")
    sum(fear_stems %in% tweet_stems)
  }, integer(1), USE.NAMES = FALSE)

  100 * sum(fear_per_text) / Find_total_terms(sa_text)
}

# Percentage of fear terms per candidate (stored in the `text` column)
ad_fear <- aggregate(text ~ candidate, data = all_data, FUN = Find_NRC_sentiments)

# Bar chart of the fear percentage, one bar per candidate
p <- plot_ly(data = ad_fear, x = ~candidate, y = ~text, name = ~candidate, type = "bar")
p <- layout(
  p,
  title = "NRC's Fear Post %",
  xaxis = list(title = "Vaccines"),
  yaxis = list(title = "%")
)
p

JMSC 6116 Lecture 6: Vaccine Information War: A Sentiment Analysis

King-wa Fu

March 19, 2021

1. Sentiment Analysis