1. Sentiment Analysis

Search tweets by #metoo and #himtoo and count the number of positive and negative terms.

# Load the Bing lexicon: one row per word together with its sentiment label
sentiment_term <- get_sentiments(lexicon = "bing")
head(x = sentiment_term)

## # A tibble: 6 x 2
##   word       sentiment
##   <chr>      <chr>    
## 1 2-faced    negative 
## 2 2-faces    negative 
## 3 a+         positive 
## 4 abnormal   negative 
## 5 abolish    negative 
## 6 abominable negative

# Request 500 English-language tweets (retweets excluded) for each hashtag
search_term <- "#metoo"
metoo <- search_tweets(
  search_term,
  n = 500,
  include_rts = FALSE,
  lang = "en"
)

search_term <- "#himtoo"
himtoo <- search_tweets(
  search_term,
  n = 500,
  include_rts = FALSE,
  lang = "en"
)

# Split every tweet on single spaces, then collect the sentiment label of each
# lexicon word that occurs among the tweet's tokens (a lexicon word is counted
# at most once per tweet, however often it is repeated)
metoo_words <- strsplit(metoo$text, split = " ")
metoo_words_sent <- sapply(metoo_words, function(tokens) {
  in_tweet <- match(sentiment_term$word, tokens, nomatch = 0L) > 0L
  sentiment_term$sentiment[in_tweet]
})
# Tally the labels over all tweets
table(unlist(metoo_words_sent))

## 
## negative positive 
##      290      239

# Same tokenise-and-match tally for the #himtoo tweets
himtoo_words <- strsplit(himtoo$text, split = " ")
table(unlist(sapply(himtoo_words, function(tokens) {
  in_tweet <- match(sentiment_term$word, tokens, nomatch = 0L) > 0L
  sentiment_term$sentiment[in_tweet]
})))

## 
## negative positive 
##      157       81

Repeat the analysis, but this time with all terms stemmed.

# Word stemming: replace every lexicon word by its English stem
sentiment_term$word <- wordStem(sentiment_term$word, language = "english")
# Different words can collapse to one stem; keep only the first row per stem
sentiment_term <- sentiment_term[!duplicated(sentiment_term$word), ]

# Stem the tweet tokens as well before matching them against the lexicon
metoo_tb <- table(unlist(sapply(metoo_words, function(tokens) {
  stems <- wordStem(tokens, language = "english")
  sentiment_term$sentiment[is.element(sentiment_term$word, stems)]
})))

himtoo_tb <- table(unlist(sapply(himtoo_words, function(tokens) {
  stems <- wordStem(tokens, language = "english")
  sentiment_term$sentiment[is.element(sentiment_term$word, stems)]
})))

prop.table(metoo_tb)  # Use prop.table to view the counts as proportions.

## 
##  negative  positive 
## 0.4564498 0.5435502

prop.table(himtoo_tb)  # Same proportions for the #himtoo sentiment counts.

## 
##  negative  positive 
## 0.5472637 0.4527363

# Helper functions computing total terms, positive/negative term counts and a
# "sentiment score" for a character vector of texts

# Count all space-separated tokens in a character vector of texts.
#   metoo_text: character vector; each element is split on single spaces.
# Returns the total number of tokens summed over every element.
Find_total_terms <- function(metoo_text) {
  tokens_per_text <- lengths(strsplit(metoo_text, split = " "))
  sum(tokens_per_text)
}

# Count the positive sentiment terms found in a character vector of texts.
#   metoo_text: character vector; each element is split on single spaces and
#               its tokens are stemmed before being matched to the lexicon.
# Reads the global `sentiment_term` (columns `word` and `sentiment`). A lexicon
# word is counted at most once per text.
# Returns a one-element table named "positive"; the count is 0 when no
# positive term is matched.
Find_pos_sentiments <- function(metoo_text) {
  metoo_words <- strsplit(metoo_text, " ")
  matched <- unlist(lapply(metoo_words, function(z) {
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Fix both levels so the count is looked up by name. Positional indexing
  # (s[2]) returned NA whenever only one sentiment class had been matched.
  s <- table(factor(matched, levels = c("negative", "positive")))
  s["positive"]
}

# Count the negative sentiment terms found in a character vector of texts.
#   metoo_text: character vector; each element is split on single spaces and
#               its tokens are stemmed before being matched to the lexicon.
# Reads the global `sentiment_term` (columns `word` and `sentiment`). A lexicon
# word is counted at most once per text.
# Returns a one-element table named "negative"; the count is 0 when no
# negative term is matched.
Find_neg_sentiments <- function(metoo_text) {
  metoo_words <- strsplit(metoo_text, " ")
  matched <- unlist(lapply(metoo_words, function(z) {
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Fix both levels so the count is looked up by name. Positional indexing
  # (s[1]) returned the *positive* count when no negative term was matched.
  s <- table(factor(matched, levels = c("negative", "positive")))
  s["negative"]
}

# Sentiment score of a character vector of texts:
#   (positive - negative) / (positive + negative)
#   metoo_text: character vector; each element is split on single spaces and
#               its tokens are stemmed before being matched to the lexicon.
# Reads the global `sentiment_term` (columns `word` and `sentiment`). A lexicon
# word is counted at most once per text.
# Returns a one-element table in [-1, 1]; NaN (0 / 0) when no sentiment term
# is matched at all.
Find_sentimentscore <- function(metoo_text) {
  metoo_words <- strsplit(metoo_text, " ")
  matched <- unlist(lapply(metoo_words, function(z) {
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Fix both levels so a missing class counts as 0. Positional indexing
  # (s[1], s[2]) produced NA whenever only one class had been matched.
  s <- table(factor(matched, levels = c("negative", "positive")))
  (s["positive"] - s["negative"]) / (s["positive"] + s["negative"])
}

# Hashtags to compare
hashtags <- c("#metoo", "#yesallwomen", "#heforshe", "#himtoo", "#mentoo", "#notmetoo")

# Search every hashtag, normalise the tweet text and stack the results into the
# data frame all_data (NULL when no hashtag returns any tweet).
# Results are collected in a list and bound once at the end instead of growing
# all_data with rbind() on every iteration.
tweets_by_tag <- vector("list", length(hashtags))
for (i in seq_along(hashtags)) {
  h <- hashtags[i]
  st <- search_tweets(h, n = 1000, include_rts = FALSE, lang = "en")
  if (nrow(st) != 0) {
    # Replace every character that is not alphanumeric or a space with a space;
    # this already covers punctuation, so no separate [[:punct:]] pass is needed
    st$text <- gsub("[^[:alnum:] ]", " ", st$text)
    st$text <- tolower(st$text)  # in lower case
    st$hashtag <- h  # remember which search produced each row
    tweets_by_tag[[i]] <- st
  }
  # Pause between searches -- presumably to stay within API rate limits
  Sys.sleep(10)
}
# Hashtags without results left NULL entries, which rbind() ignores
all_data <- do.call(rbind, tweets_by_tag)

# Use aggregate to count the number of posts for each hashtag
aggregate(text~hashtag,all_data,length)

##        hashtag text
## 1    #heforshe  681
## 2      #himtoo  187
## 3      #mentoo  275
## 4       #metoo  982
## 5    #notmetoo   22
## 6 #yesallwomen   59

# Per-hashtag summaries: total terms, positive terms, negative terms and the
# sentiment score
ad_tlt <- aggregate(text ~ hashtag, data = all_data, FUN = Find_total_terms)
ad_pos <- aggregate(text ~ hashtag, data = all_data, FUN = Find_pos_sentiments)
ad_neg <- aggregate(text ~ hashtag, data = all_data, FUN = Find_neg_sentiments)
ad_ssc <- aggregate(text ~ hashtag, data = all_data, FUN = Find_sentimentscore)

# Express the sentiment-term counts as a share of all terms for each hashtag
ad_ssc[["sent_p"]] <- (ad_pos[["text"]] + ad_neg[["text"]]) / ad_tlt[["text"]]
ad_ssc[["possent_p"]] <- ad_pos[["text"]] / ad_tlt[["text"]]
ad_ssc[["negsent_p"]] <- ad_neg[["text"]] / ad_tlt[["text"]]

# Bar chart: share of all sentiment terms per hashtag
p <- plot_ly(ad_ssc, x = ~hashtag, y = ~sent_p, name = ~hashtag, type = "bar")
p <- layout(
  p,
  title = "% of Sentiment Terms",
  xaxis = list(title = "#Hashtag"),
  yaxis = list(title = "Percentage")
)
p

# Bar chart: share of negative sentiment terms per hashtag
p <- plot_ly(ad_ssc, x = ~hashtag, y = ~negsent_p, name = ~hashtag, type = "bar")
p <- layout(
  p,
  title = "% of Negative Sentiment Terms",
  xaxis = list(title = "#Hashtag"),
  yaxis = list(title = "Percentage")
)
p

# Bar chart: share of positive sentiment terms per hashtag
p <- plot_ly(ad_ssc, x = ~hashtag, y = ~possent_p, name = ~hashtag, type = "bar")
p <- layout(
  p,
  title = "% of Positive Sentiment Terms",
  xaxis = list(title = "#Hashtag"),
  yaxis = list(title = "Percentage")
)
p

# Bar chart: sentiment score per hashtag (stored in the `text` column of ad_ssc)
p <- plot_ly(ad_ssc, x = ~hashtag, y = ~text, name = ~hashtag, type = "bar")
p <- layout(
  p,
  title = "Sentiment Score",
  xaxis = list(title = "#Hashtag"),
  yaxis = list(title = "Score")
)
p

Lastly, we try using NRC’s sentiment analysis for anger posts.

# Proportion of matched sentiment terms falling in each sentiment category.
#   metoo_text: character vector; each element is split on single spaces and
#               its tokens are stemmed before being matched to the lexicon.
# Reads the global `sentiment_term` (columns `word` and `sentiment`); the
# categories reported are whatever labels that lexicon contains.
# NOTE(review): the name implies the NRC lexicon -- confirm `sentiment_term`
# has been loaded with it before this function is called.
# Returns a table of proportions with one entry per lexicon category, in
# sorted order; categories with no match are 0. All entries are NaN when
# nothing is matched.
Find_NRC_sentiments <- function(metoo_text) {
  metoo_words <- strsplit(metoo_text, " ")
  matched <- unlist(lapply(metoo_words, function(z) {
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Use the full category set as levels so every call returns a vector of the
  # same length and order. Otherwise aggregate() cannot bind the per-group
  # results into a matrix and column positions would shift between groups.
  categories <- sort(unique(sentiment_term$sentiment))
  nrc_sentiment <- prop.table(table(factor(matched, levels = categories)))
  nrc_sentiment
}

# Per-hashtag sentiment proportions; `text` holds one column per category
ad_anger <- aggregate(text ~ hashtag, data = all_data, FUN = Find_NRC_sentiments)

# Plot the first category column.
# NOTE(review): column 1 is "anger" only if `sentiment_term` holds the NRC
# lexicon at this point -- confirm.
p <- plot_ly(ad_anger, x = ~hashtag, y = ~text[, 1], name = ~hashtag, type = "bar")
p <- layout(
  p,
  title = "NRC's Anger Post %",
  xaxis = list(title = "#Hashtag"),
  yaxis = list(title = "%")
)
p

JMSC 6116 Lecture 6: #MeToo vs. #HimToo: Sentiment Analysis

King-wa Fu

March 1, 2019

1. Sentiment Analysis