require(tidyverse) # data manipulation
require(tidytext) # text mining and sentiment lexicons
require(rtweet) # Twitter search
require(stringr) # string manipulation
require(plotly) # plotting package
require(SnowballC) # word stemming
Search tweets for “Bernie Sanders” and “Joe Biden”, then count the number of positive and negative terms in each set.
# Load the Bing lexicon of sentiment terms
sentiment_term <- get_sentiments("bing")
head(sentiment_term)
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
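Before counting anything, it is worth tabulating the lexicon itself: the Bing lexicon contains noticeably more negative than positive entries, which should be kept in mind when reading raw counts.
# Count lexicon entries per sentiment class
table(sentiment_term$sentiment)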
# Search query
search_term <- "Bernie Sanders"
bernie <- search_tweets(search_term, n=500, include_rts = FALSE, lang="en")
search_term <- "Joe Biden"
joe <- search_tweets(search_term, n=500, include_rts = FALSE, lang="en")
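search_tweets() assumes an authenticated rtweet session is already set up. A quick look at the returned data frames confirms what came back before any processing:
# Sanity-check the search results: row counts and a sample of the raw text
nrow(bernie)
nrow(joe)
head(bernie$text, 3)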
# Tokenize the terms and Match with the sentiment terms
bernie_words <- strsplit(bernie$text,' ')
bernie_words_sent <- lapply(bernie_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% z]
})
table(unlist(bernie_words_sent))
##
## negative positive
## 308 338
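Note the direction of the match: the lookup iterates over lexicon entries, so each matching lexicon word contributes one label per tweet even if it is repeated within the tweet. A minimal sketch on a hand-made token vector (the sentence is invented for illustration) shows how it works; it should yield one "positive" label (for "wonderful") and one "negative" label (for "awful").
# Toy example: only tokens that appear in the lexicon survive the match
toy_tokens <- c("what", "a", "wonderful", "awful", "debate")
sentiment_term$sentiment[sentiment_term$word %in% toy_tokens]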
joe_words <- strsplit(joe$text,' ')
table(unlist(lapply(joe_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% z]
})))
##
## negative positive
## 309 321
Redo the analysis, but this time with all terms stemmed.
# Word Stemming
sentiment_term$word <- wordStem(sentiment_term$word,"english")
sentiment_term <- sentiment_term[!duplicated(sentiment_term$word),] # remove duplicated terms
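Stemming collapses inflected variants onto one lexicon entry; for example, the "abomin-" family from the lexicon preview above should map to a single common stem. One caveat: when two entries with opposite sentiment collapse to the same stem, the deduplication above keeps whichever label happens to come first.
# The Porter stemmer maps related word forms to a common stem
wordStem(c("abominable","abominably","abominate"), "english")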
bernie_tb <- table(unlist(sapply(bernie_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
})))
joe_tb <- table(unlist(sapply(joe_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
})))
prop.table(bernie_tb) # use prop.table to view the proportions
##
## negative positive
## 0.375802 0.624198
prop.table(joe_tb)
##
## negative positive
## 0.4024976 0.5975024
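The positive shares look fairly close. As a rough check of whether the gap is bigger than sampling noise, one option is a two-sample proportion test on the raw counts (treating each matched term as an independent draw, which is a simplification, since terms cluster within tweets):
# Compare the positive-term share between the two samples
prop.test(c(bernie_tb["positive"], joe_tb["positive"]),
c(sum(bernie_tb), sum(joe_tb)))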
# Define functions to calculate total terms, positive/negative term counts, and a "sentiment score" for a set of tweets
Find_total_terms <- function(sa_text){
sa_words <- strsplit(sa_text,' ')
tt <- sum(unlist(lapply(sa_words, length)))
return(tt)
}
Find_pos_sentiments <- function(sa_text){
sa_words <- strsplit(sa_text,' ')
s <- table(unlist(lapply(sa_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
})))
return(s["positive"]) # index by name rather than position, in case one class is absent
}
Find_neg_sentiments <- function(sa_text){
sa_words <- strsplit(sa_text,' ')
s <- table(unlist(lapply(sa_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
})))
return(s["negative"])
}
Find_sentimentscore <- function(sa_text){
sa_words <- strsplit(sa_text,' ')
s <- table(unlist(lapply(sa_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
})))
score <- (s["positive"]-s["negative"])/(s["positive"]+s["negative"])
return(unname(score))
}
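The helpers expect a character vector of cleaned tweet text. As a quick sketch, they can be tried on the Bernie Sanders tweets after the same clean-up that the loop below applies:
# Trial run on one set of tweets, cleaned the same way as in the loop below
txt <- tolower(gsub('[[:punct:]]',' ', bernie$text))
Find_total_terms(txt)
Find_pos_sentiments(txt)
Find_neg_sentiments(txt)
Find_sentimentscore(txt)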
# Different candidates
candidates <- c("Joe Biden","Bernie Sanders","Elizabeth Warren","Michael Bloomberg","Pete Buttigieg")
# Build a loop to search for each candidate and combine all results into the data.frame all_data
all_data <- c()
for(cand in candidates){ # use a descriptive loop variable instead of c, which collides with base::c()
st <- search_tweets(cand, n=1000, include_rts = FALSE, lang="en")
if (nrow(st)!=0){
st$text <- gsub('[[:punct:]]',' ',st$text) # remove all punctuation characters
st$text <- gsub('[^[:alnum:] ]',' ',st$text) # remove remaining non-alphanumeric characters
st$text <- tolower(st$text) # convert to lower case
st$candidate <- cand
all_data <- rbind(all_data,st)
}
Sys.sleep(10) # pause between queries to respect the API rate limit
}
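Since search results change every time the query runs and the API is rate-limited, it can help to cache the combined data frame so the rest of the analysis is reproducible without re-querying (the file name here is just an example):
# Optional: cache the collected tweets for offline reruns
saveRDS(all_data, "all_data.rds")
# all_data <- readRDS("all_data.rds")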
# Use aggregate to count the number of posts for each candidate
aggregate(text~candidate,all_data,length)
## candidate text
## 1 Bernie Sanders 1000
## 2 Elizabeth Warren 1000
## 3 Joe Biden 1000
## 4 Michael Bloomberg 988
## 5 Pete Buttigieg 1000
# Calculate total terms, positive/negative terms, and sentiment score for each candidate
ad_tlt <- aggregate(text~candidate,all_data,Find_total_terms)
ad_pos <- aggregate(text~candidate,all_data,Find_pos_sentiments)
ad_neg <- aggregate(text~candidate,all_data,Find_neg_sentiments)
ad_ssc <- aggregate(text~candidate,all_data,Find_sentimentscore)
# Convert the counts into percentages of total terms
ad_ssc$sent_p <- 100*(ad_pos$text + ad_neg$text) / ad_tlt$text
ad_ssc$possent_p <- 100*ad_pos$text / ad_tlt$text
ad_ssc$negsent_p <- 100*ad_neg$text / ad_tlt$text
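Before plotting, a quick ranking by sentiment score (aggregate stores the score in the text column) gives a first read of the result:
# Rank candidates from most to least positive
ad_ssc[order(-ad_ssc$text), c("candidate","text","sent_p")]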
# Plot % of Sentiment terms
p <- plot_ly(ad_ssc, x = ~candidate, y = ~sent_p, name = ~candidate, type = 'bar')
p <- layout(p, title = "% of Sentiment Terms", xaxis = list(title = "Candidates"), yaxis = list (title = "Percentage"))
p
# Plot % of Negative Sentiment terms
p <- plot_ly(ad_ssc, x = ~candidate, y = ~negsent_p, name = ~candidate, type = 'bar')
p <- layout(p, title = "% of Negative Sentiment Terms", xaxis = list(title = "Candidates"), yaxis = list (title = "Percentage"))
p
# Plot % of Postive Sentiment terms
p <- plot_ly(ad_ssc, x = ~candidate, y = ~possent_p, name = ~candidate, type = 'bar')
p <- layout(p, title = "% of Positive Sentiment Terms", xaxis = list(title = "candidates"), yaxis = list (title = "Percentage"))
p
# Plot Sentiment Score
p <- plot_ly(ad_ssc, x = ~candidate, y = ~text, name = ~candidate, type = 'bar') # aggregate stored the score in the text column
p <- layout(p, title = "Sentiment Score", xaxis = list(title = "Candidates"), yaxis = list (title = "Score"))
p
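As an alternative to separate charts, the positive and negative percentages can be shown side by side in one grouped bar chart; this is a sketch using plotly's add_trace with barmode = 'group':
# Positive vs. negative term percentages in a single grouped chart
p <- plot_ly(ad_ssc, x = ~candidate, y = ~possent_p, name = 'positive', type = 'bar')
p <- add_trace(p, y = ~negsent_p, name = 'negative')
p <- layout(p, barmode = 'group', title = "% of Positive vs Negative Terms", xaxis = list(title = "Candidates"), yaxis = list(title = "Percentage"))
p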
Lastly, we try the NRC lexicon to measure the share of anger-related terms in posts about each candidate.
sentiment_term <- get_sentiments("nrc")
# Use command prop.table to find the %
Find_NRC_sentiments <- function(sa_text){
sa_words <- strsplit(sa_text,' ')
nrc_sentiment <- 100*prop.table(table(unlist(lapply(sa_words,function(z){
sentiment_term$sentiment[sentiment_term$word %in% wordStem(z,"english")]
}))))
return(nrc_sentiment)
}
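Find_NRC_sentiments returns a named percentage table over all matched NRC categories (anger, anticipation, disgust, fear, joy, negative, positive, sadness, surprise, trust). A sketch on a single invented sentence shows the shape of the output:
# The return value is a named table of category percentages
Find_NRC_sentiments("the crowd was angry and fearful but also hopeful")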
ad_anger <- aggregate(text~candidate,all_data,Find_NRC_sentiments)
p <- plot_ly(ad_anger, x = ~candidate, y = ~text[,1], name = ~candidate, type = 'bar') # column 1 of the table is "anger" (NRC categories sort alphabetically)
p <- layout(p, title = "NRC's Anger Post %", xaxis = list(title = "Candidates"), yaxis = list (title = "%"))
p