if (!require("rtweet")) install.packages("rtweet", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("tidyverse")) install.packages("tidyverse", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("tidytext")) install.packages("tidytext", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("stringr")) install.packages("stringr", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("SnowballC")) install.packages("SnowballC", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
library(tidyverse) # sentiment analysis
library(tidytext) # text manipulation
library(rtweet) # Twitter search
library(stringr) # String manipulation
library(plotly) # plotting package
library(SnowballC) # Word Stemming
# Search tweets by location
# Load the "bing" sentiment lexicon (columns: word, sentiment) and preview it
sentiment_term <- get_sentiments(lexicon = "bing")
head(sentiment_term, n = 6L)
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 2-faced negative
## 2 2-faces negative
## 3 a+ positive
## 4 abnormal negative
## 5 abolish negative
## 6 abominable negative
# Hashtag used for every Twitter search in this script
search_term <- "#guncontrol"
# Request 500 English, non-retweet posts geocoded to each of two locations.
# NOTE(review): the "Washington" lookup is stored in `nra_dc` -- confirm the
# geocoder resolves it to Washington DC rather than Washington state.
nra_dc <- search_tweets(
  search_term,
  n = 500,
  include_rts = FALSE,
  lang = "en",
  geocode = lookup_coords("Washington", "country:US")
)
nra_fl <- search_tweets(
  search_term,
  n = 500,
  include_rts = FALSE,
  lang = "en",
  geocode = lookup_coords("Florida", "country:US")
)
# Split every tweet on single spaces, then collect the lexicon sentiment label
# of each lexicon word that appears among a tweet's tokens
nra_dc_words <- strsplit(nra_dc$text, split = " ")
nra_dc_words_sent <- sapply(nra_dc_words, function(tokens) {
  sentiment_term[["sentiment"]][sentiment_term[["word"]] %in% tokens]
})
# Tally of negative / positive labels over all Washington tweets
table(unlist(nra_dc_words_sent))
##
## negative positive
## 307 280
# Same tally for the Florida tweets
nra_fl_words <- strsplit(nra_fl$text, split = " ")
table(unlist(
  sapply(nra_fl_words, function(tokens) {
    sentiment_term[["sentiment"]][sentiment_term[["word"]] %in% tokens]
  })
))
##
## negative positive
## 248 226
# Word stemming: replace each lexicon word by its English stem, then keep only
# the first row for every stem so a stem is listed once
sentiment_term$word <- wordStem(sentiment_term$word, "english")
sentiment_term <- sentiment_term[!duplicated(sentiment_term$word), ]
# Re-run the tallies, this time stemming the tweet tokens before matching
table(unlist(
  sapply(nra_dc_words, function(tokens) {
    sentiment_term[["sentiment"]][
      sentiment_term[["word"]] %in% wordStem(tokens, "english")
    ]
  })
))
##
## negative positive
## 425 611
table(unlist(
  sapply(nra_fl_words, function(tokens) {
    sentiment_term[["sentiment"]][
      sentiment_term[["word"]] %in% wordStem(tokens, "english")
    ]
  })
))
##
## negative positive
## 347 515
# Define functions to calculate total, positive, negative terms, "sentiment score" by location
# Count the terms in a set of texts.
#
# nra_text: character vector of texts (one element per tweet).
# Returns:  a single integer, the number of non-empty space-separated tokens
#           summed over all elements (0 for empty input).
Find_total_terms <- function(nra_text) {
  nra_words <- strsplit(nra_text, " ", fixed = TRUE)
  # Consecutive spaces (frequent once punctuation has been replaced by spaces)
  # produce "" tokens; those are not terms and must not inflate the total that
  # the sentiment percentages are divided by.
  sum(vapply(nra_words, function(tokens) sum(nzchar(tokens)), integer(1)))
}
# Count positive sentiment matches in a set of texts.
#
# Each text is split on spaces and its tokens are stemmed; every row of the
# global `sentiment_term` lexicon whose word is among a text's stems
# contributes that row's sentiment label once for that text.
#
# nra_text: character vector of texts (one element per tweet).
# Returns:  a one-cell table named "positive" holding the number of positive
#           labels collected (0 when there are none).
Find_pos_sentiments <- function(nra_text) {
  nra_words <- strsplit(nra_text, " ")
  matched <- unlist(lapply(nra_words, function(z) {
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Fix both levels so the table always has a "negative" and a "positive" cell;
  # indexing by position returned NA here whenever no negative term matched.
  s <- table(factor(matched, levels = c("negative", "positive")))
  s["positive"]
}
# Count negative sentiment matches in a set of texts.
#
# Each text is split on spaces and its tokens are stemmed; every row of the
# global `sentiment_term` lexicon whose word is among a text's stems
# contributes that row's sentiment label once for that text.
#
# nra_text: character vector of texts (one element per tweet).
# Returns:  a one-cell table named "negative" holding the number of negative
#           labels collected (0 when there are none).
Find_neg_sentiments <- function(nra_text) {
  nra_words <- strsplit(nra_text, " ")
  matched <- unlist(lapply(nra_words, function(z) {
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Fix both levels so the table always has a "negative" and a "positive" cell;
  # indexing by position returned the *positive* count here whenever no
  # negative term matched.
  s <- table(factor(matched, levels = c("negative", "positive")))
  s["negative"]
}
# Net sentiment score of a set of texts: (positive - negative) / (positive + negative).
#
# Each text is split on spaces and its tokens are stemmed; every row of the
# global `sentiment_term` lexicon whose word is among a text's stems
# contributes that row's sentiment label once for that text.
#
# nra_text: character vector of texts (one element per tweet).
# Returns:  a single numeric between -1 and 1, or NA when no sentiment term
#           matched at all (the ratio is undefined in that case).
Find_sentimentscore <- function(nra_text) {
  nra_words <- strsplit(nra_text, " ")
  matched <- unlist(lapply(nra_words, function(z) {
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Fix both levels so a missing polarity counts as 0; indexing by position
  # made the score NA whenever only one polarity occurred.
  s <- table(factor(matched, levels = c("negative", "positive")))
  pos <- s[["positive"]]
  neg <- s[["negative"]]
  if (pos + neg == 0L) {
    return(NA_real_)
  }
  (pos - neg) / (pos + neg)
}
# Selected states
state <- c("arizona", "california", "florida", "massachusetts", "north carolina", "texas", "virginia", "washington dc")
# Search each location in turn, normalise the tweet text, tag the rows with
# the location and stack everything into the data frame `all_data`
all_data <- c()
for (state_name in state) {
  st <- search_tweets(
    search_term,
    n = 1000,
    include_rts = FALSE,
    lang = "en",
    geocode = lookup_coords(state_name, "country:US")
  )
  if (nrow(st) != 0) {
    # POSIX classes only work inside a bracket list: '[:punct:]' on its own
    # matches the literal characters ":", "p", "u", "n", "c", "t" and would
    # blank ordinary letters out of the text.
    st$text <- gsub("[[:punct:]]", " ", st$text) # replace punctuation with spaces
    # Replace everything that is NOT a visible character (tabs, newlines,
    # control characters) with a space. The class is negated on purpose:
    # '[[:graph:]]' itself would blank every visible character.
    st$text <- gsub("[^[:graph:]]", " ", st$text)
    st$text <- tolower(st$text) # in lower case
    st$state_name <- state_name
    all_data <- rbind(all_data, st)
  }
  # Pause between searches
  Sys.sleep(10)
}
# Number of collected posts per location
aggregate(text ~ state_name, data = all_data, FUN = length)
## state_name text
## 1 arizona 901
## 2 california 972
## 3 florida 999
## 4 massachusetts 1000
## 5 north carolina 1000
## 6 texas 966
## 7 virginia 988
## 8 washington dc 771
# Per-location summaries: total terms, positive terms, negative terms and the
# sentiment score (each result lands in the `text` column of its data frame)
ad_tlt <- aggregate(text ~ state_name, data = all_data, FUN = Find_total_terms)
ad_pos <- aggregate(text ~ state_name, data = all_data, FUN = Find_pos_sentiments)
ad_neg <- aggregate(text ~ state_name, data = all_data, FUN = Find_neg_sentiments)
ad_ssc <- aggregate(text ~ state_name, data = all_data, FUN = Find_sentimentscore)
# Express the counts relative to the total number of terms
# (plain ratios; they are not multiplied by 100)
ad_ssc[["sent_p"]] <- (ad_pos[["text"]] + ad_neg[["text"]]) / ad_tlt[["text"]]
ad_ssc[["possent_p"]] <- ad_pos[["text"]] / ad_tlt[["text"]]
ad_ssc[["negsent_p"]] <- ad_neg[["text"]] / ad_tlt[["text"]]
# Each chart below follows the same recipe: sort the locations by the plotted
# column (largest first), freeze that order in the factor levels so the bars
# keep it, then draw one bar per location.

# Share of sentiment terms (positive + negative)
ad_ssc <- ad_ssc[order(ad_ssc$sent_p, decreasing = TRUE), ]
ad_ssc$state_name <- factor(ad_ssc$state_name, levels = ad_ssc$state_name)
p <- plot_ly(ad_ssc, x = ~state_name, y = ~sent_p, name = ~state_name, type = 'bar') %>%
  layout(
    title = "% of Sentiment Terms",
    xaxis = list(title = "State"),
    yaxis = list(title = "Percentage")
  )
p

# Share of negative sentiment terms
ad_ssc <- ad_ssc[order(ad_ssc$negsent_p, decreasing = TRUE), ]
ad_ssc$state_name <- factor(ad_ssc$state_name, levels = ad_ssc$state_name)
p <- plot_ly(ad_ssc, x = ~state_name, y = ~negsent_p, name = ~state_name, type = 'bar') %>%
  layout(
    title = "% of Negative Sentiment Terms",
    xaxis = list(title = "State"),
    yaxis = list(title = "Percentage")
  )
p

# Share of positive sentiment terms
ad_ssc <- ad_ssc[order(ad_ssc$possent_p, decreasing = TRUE), ]
ad_ssc$state_name <- factor(ad_ssc$state_name, levels = ad_ssc$state_name)
p <- plot_ly(ad_ssc, x = ~state_name, y = ~possent_p, name = ~state_name, type = 'bar') %>%
  layout(
    title = "% of Positive Sentiment Terms",
    xaxis = list(title = "State"),
    yaxis = list(title = "Percentage")
  )
p

# Sentiment score (held in the `text` column produced by aggregate())
ad_ssc <- ad_ssc[order(ad_ssc$text, decreasing = TRUE), ]
ad_ssc$state_name <- factor(ad_ssc$state_name, levels = ad_ssc$state_name)
p <- plot_ly(ad_ssc, x = ~state_name, y = ~text, name = ~state_name, type = 'bar') %>%
  layout(
    title = "Sentiment Score",
    xaxis = list(title = "State"),
    yaxis = list(title = "Score")
  )
p