if (!require("rtweet")) install.packages("rtweet", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("tidyverse")) install.packages("tidyverse", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("tidytext")) install.packages("tidytext", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("stringr")) install.packages("stringr", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("SnowballC")) install.packages("SnowballC", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
require(tidyverse) # sentiment analysis
require(tidytext) # text manuipation
require(rtweet) # Twitter search
require(stringr) # String manuipation
require(plotly) # plotting package
require(SnowballC) # Word Stemming
Search tweets for #metoo and #himtoo and count the number of positive and negative terms.
# Load the Bing sentiment lexicon and preview its first rows
sentiment_term <- get_sentiments(lexicon = "bing")
head(sentiment_term, n = 6L)
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 2-faced negative
## 2 2-faces negative
## 3 a+ positive
## 4 abnormal negative
## 5 abolish negative
## 6 abominable negative
# Search queries: up to 500 English-language tweets per hashtag, retweets excluded.
# (No location filter is applied by these calls.)
search_term <- "#metoo"
metoo <- search_tweets(
  search_term,
  n = 500,
  include_rts = FALSE,
  lang = "en"
)

search_term <- "#himtoo"
himtoo <- search_tweets(
  search_term,
  n = 500,
  include_rts = FALSE,
  lang = "en"
)
# Split every #metoo tweet on single spaces, then collect the sentiment label
# of each lexicon word that appears among the tweet's tokens
metoo_words <- strsplit(metoo$text, split = " ")
metoo_words_sent <- sapply(metoo_words, function(tokens) {
  found <- sentiment_term$word %in% tokens
  sentiment_term$sentiment[found]
})
# Tally the matched labels over all tweets
table(unlist(metoo_words_sent))
##
## negative positive
## 290 239
# Same tokenize-and-match tally for the #himtoo tweets
himtoo_words <- strsplit(himtoo$text, split = " ")
table(unlist(lapply(himtoo_words, function(tokens) {
  found <- sentiment_term$word %in% tokens
  sentiment_term$sentiment[found]
})))
##
## negative positive
## 157 81
Repeat the analysis, but this time with all terms stemmed.
# Word stemming: reduce every lexicon word to its English stem
sentiment_term$word <- wordStem(sentiment_term$word, language = "english")
# Different words can collapse to the same stem; keep the first entry per stem
sentiment_term <- sentiment_term[!duplicated(sentiment_term$word), ]

# Match stemmed tweet tokens against the stemmed lexicon and tally the labels
metoo_tb <- table(unlist(lapply(metoo_words, function(tokens) {
  stems <- wordStem(tokens, language = "english")
  sentiment_term$sentiment[sentiment_term$word %in% stems]
})))
himtoo_tb <- table(unlist(lapply(himtoo_words, function(tokens) {
  stems <- wordStem(tokens, language = "english")
  sentiment_term$sentiment[sentiment_term$word %in% stems]
})))

prop.table(metoo_tb) # prop.table shows the proportion of each label
##
## negative positive
## 0.4564498 0.5435502
prop.table(himtoo_tb)
##
## negative positive
## 0.5472637 0.4527363
# Define functions to calculate the total, positive and negative term counts and the "sentiment score" for a set of posts
# Count the terms contained in a set of posts.
#
# metoo_text: character vector, one element per post.
# Returns a single integer: the number of non-empty, whitespace-separated
# terms summed over all posts (0 for empty input).
Find_total_terms <- function(metoo_text) {
  # Split on runs of whitespace so repeated spaces (e.g. left behind after
  # punctuation is replaced by spaces) do not produce empty "terms".
  metoo_words <- strsplit(metoo_text, "[[:space:]]+")
  # A leading separator still yields one "" token and an NA post yields NA;
  # neither is a term, so both are excluded from the count.
  term_counts <- vapply(
    metoo_words,
    function(z) sum(!is.na(z) & nzchar(z)),
    integer(1)
  )
  sum(term_counts)
}
# Count positive sentiment matches in a set of posts.
#
# metoo_text: character vector, one element per post.
# Returns a single integer: for every post, each entry of the global
# `sentiment_term` lexicon whose word equals a stemmed token of that post
# contributes one match; the matches labelled "positive" are counted.
# Returns 0 (not NA) when there are no positive matches.
Find_pos_sentiments <- function(metoo_text) {
  metoo_words <- strsplit(metoo_text, " ")
  matched <- unlist(lapply(metoo_words, function(z) {
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Select by label rather than by table position: a positional index picks
  # the wrong cell (or NA) whenever only one of the two labels is present.
  sum(matched == "positive")
}
# Count negative sentiment matches in a set of posts.
#
# metoo_text: character vector, one element per post.
# Returns a single integer: for every post, each entry of the global
# `sentiment_term` lexicon whose word equals a stemmed token of that post
# contributes one match; the matches labelled "negative" are counted.
# Returns 0 when there are no negative matches.
Find_neg_sentiments <- function(metoo_text) {
  metoo_words <- strsplit(metoo_text, " ")
  matched <- unlist(lapply(metoo_words, function(z) {
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Select by label rather than by table position: with only positive matches
  # the first table cell is the positive count, not the negative one.
  sum(matched == "negative")
}
# Compute the sentiment score of a set of posts.
#
# metoo_text: character vector, one element per post.
# Returns a single number, (positive - negative) / (positive + negative),
# where the counts are lexicon matches against the global `sentiment_term`
# (stemmed tokens). The score lies in [-1, 1]; NA_real_ is returned when no
# sentiment term is matched at all.
Find_sentimentscore <- function(metoo_text) {
  metoo_words <- strsplit(metoo_text, " ")
  matched <- unlist(lapply(metoo_words, function(z) {
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Count by label, not by table position, so that posts matching only one
  # polarity still score -1 or 1 instead of NA.
  n_pos <- sum(matched == "positive")
  n_neg <- sum(matched == "negative")
  if (n_pos + n_neg == 0) {
    # No sentiment terms: the score is undefined rather than 0/0.
    return(NA_real_)
  }
  (n_pos - n_neg) / (n_pos + n_neg)
}
# Hashtags to compare
hashtags <- c("#metoo","#yesallwomen","#heforshe","#himtoo","#mentoo","#notmetoo")
# Search each hashtag, normalise the tweet text, and stack the non-empty
# results into one data frame (all_data) with a `hashtag` column.
# Results are collected in a list and bound once instead of growing a data
# frame with rbind() inside a loop.
per_hashtag <- lapply(seq_along(hashtags), function(i) {
  h <- hashtags[[i]]
  st <- search_tweets(h, n = 1000, include_rts = FALSE, lang = "en")
  # Pause between consecutive requests; no wait is needed after the last one
  if (i < length(hashtags)) {
    Sys.sleep(10)
  }
  if (nrow(st) == 0) {
    return(NULL)
  }
  # Replace everything that is not a letter, digit or space with a space.
  # Punctuation is already non-alphanumeric, so a separate pass is not needed.
  st$text <- gsub('[^[:alnum:] ]', ' ', st$text)
  st$text <- tolower(st$text) # in lower case
  st$hashtag <- h
  st
})
# all_data is NULL when no hashtag returned any tweets
all_data <- do.call(rbind, Filter(Negate(is.null), per_hashtag))
# Use aggregate to count the number of posts collected for each hashtag
aggregate(text ~ hashtag, data = all_data, FUN = length)
## hashtag text
## 1 #heforshe 681
## 2 #himtoo 187
## 3 #mentoo 275
## 4 #metoo 982
## 5 #notmetoo 22
## 6 #yesallwomen 59
# Per hashtag: total terms, positive/negative term counts, and sentiment score
ad_tlt <- aggregate(text ~ hashtag, data = all_data, FUN = Find_total_terms)
ad_pos <- aggregate(text ~ hashtag, data = all_data, FUN = Find_pos_sentiments)
ad_neg <- aggregate(text ~ hashtag, data = all_data, FUN = Find_neg_sentiments)
ad_ssc <- aggregate(text ~ hashtag, data = all_data, FUN = Find_sentimentscore)

# Express the sentiment-term counts as a share of all terms
ad_ssc[["sent_p"]] <- (ad_pos[["text"]] + ad_neg[["text"]]) / ad_tlt[["text"]]
ad_ssc[["possent_p"]] <- ad_pos[["text"]] / ad_tlt[["text"]]
ad_ssc[["negsent_p"]] <- ad_neg[["text"]] / ad_tlt[["text"]]
# Bar chart: share of all sentiment terms per hashtag
p <- plot_ly(ad_ssc, x = ~hashtag, y = ~sent_p, name = ~hashtag, type = "bar") %>%
  layout(
    title = "% of Sentiment Terms",
    xaxis = list(title = "#Hashtag"),
    yaxis = list(title = "Percentage")
  )
p

# Bar chart: share of negative sentiment terms per hashtag
p <- plot_ly(ad_ssc, x = ~hashtag, y = ~negsent_p, name = ~hashtag, type = "bar") %>%
  layout(
    title = "% of Negative Sentiment Terms",
    xaxis = list(title = "#Hashtag"),
    yaxis = list(title = "Percentage")
  )
p

# Bar chart: share of positive sentiment terms per hashtag
p <- plot_ly(ad_ssc, x = ~hashtag, y = ~possent_p, name = ~hashtag, type = "bar") %>%
  layout(
    title = "% of Positive Sentiment Terms",
    xaxis = list(title = "#Hashtag"),
    yaxis = list(title = "Percentage")
  )
p

# Bar chart: sentiment score per hashtag (held in the `text` column of ad_ssc)
p <- plot_ly(ad_ssc, x = ~hashtag, y = ~text, name = ~hashtag, type = "bar") %>%
  layout(
    title = "Sentiment Score",
    xaxis = list(title = "#Hashtag"),
    yaxis = list(title = "Score")
  )
p
Lastly, we try using NRC’s sentiment analysis for anger posts.
# NRC emotion lexicon, stemmed the same way as the tweet tokens.
# NOTE: get_sentiments("nrc") relies on the `textdata` package and asks to
# download the lexicon the first time it is used.
nrc_term <- get_sentiments("nrc")
nrc_term$word <- wordStem(nrc_term$word, "english")
# One word can carry several NRC categories, so duplicates are removed on the
# (stem, category) pair rather than on the stem alone.
nrc_term <- nrc_term[!duplicated(nrc_term[, c("word", "sentiment")]), ]
nrc_levels <- sort(unique(nrc_term$sentiment))

# Proportion of matched NRC terms falling into each NRC category.
#
# metoo_text: character vector, one element per post.
# Returns a table of proportions (from prop.table) with one named cell per
# NRC category; all cells are NaN when no NRC term is matched.
Find_NRC_sentiments <- function(metoo_text) {
  metoo_words <- strsplit(metoo_text, " ")
  matched <- unlist(lapply(metoo_words, function(z) {
    nrc_term$sentiment[nrc_term$word %in% wordStem(z, "english")]
  }))
  # Fixed levels keep every category present, so each group returns the same
  # named columns and "anger" can be selected by name below.
  nrc_sentiment <- prop.table(table(factor(matched, levels = nrc_levels)))
  return(nrc_sentiment)
}
ad_anger <- aggregate(text ~ hashtag, all_data, Find_NRC_sentiments)
# Plot the share of matched NRC terms that are tagged "anger"
p <- plot_ly(ad_anger, x = ~hashtag, y = ~text[, "anger"], name = ~hashtag, type = 'bar')
p <- layout(p, title = "NRC's Anger Post %", xaxis = list(title = "#Hashtag"), yaxis = list (title = "%"))
p