1. Sentiment Analysis

Search Tweets by Location

# Load the "bing" sentiment lexicon and preview its first six rows
sentiment_term <- get_sentiments("bing")
head(sentiment_term, n = 6L)

## # A tibble: 6 x 2
##   word       sentiment
##   <chr>      <chr>    
## 1 2-faced    negative 
## 2 2-faces    negative 
## 3 a+         positive 
## 4 abnormal   negative 
## 5 abolish    negative 
## 6 abominable negative

# Hashtag used as the search query for every request below
search_term <- "#guncontrol"

# Request 500 English tweets (retweets excluded) around each location.
# NOTE(review): the variable is named nra_dc but the place name passed to
# lookup_coords() is just "Washington" -- confirm it resolves to Washington DC
# rather than Washington State.
nra_dc <- search_tweets(
  search_term,
  n = 500,
  include_rts = FALSE,
  lang = "en",
  geocode = lookup_coords("Washington", "country:US")
)

# Same request, centred on Florida
nra_fl <- search_tweets(
  search_term,
  n = 500,
  include_rts = FALSE,
  lang = "en",
  geocode = lookup_coords("Florida", "country:US")
)

# Tokenize: split every Washington tweet on single spaces
nra_dc_words <- strsplit(nra_dc$text, split = " ")

# Match: for each tweet, keep the polarity label of every lexicon word that
# appears among its tokens (a word counts once per tweet, however often it
# is repeated)
nra_dc_words_sent <- sapply(
  nra_dc_words,
  function(tokens) {
    in_tweet <- sentiment_term$word %in% tokens
    sentiment_term$sentiment[in_tweet]
  }
)

# Tabulate the matched labels over all tweets
table(unlist(nra_dc_words_sent))

## 
## negative positive 
##      307      280

# Same tokenize-and-match tabulation for the Florida tweets
nra_fl_words <- strsplit(nra_fl$text, split = " ")
table(
  unlist(
    sapply(
      nra_fl_words,
      function(tokens) {
        in_tweet <- sentiment_term$word %in% tokens
        sentiment_term$sentiment[in_tweet]
      }
    )
  )
)

## 
## negative positive 
##      248      226

# Word stemming: replace every lexicon word by its English stem, then keep
# only the first row for each stem so a stem is never counted twice
sentiment_term$word <- wordStem(sentiment_term$word, "english")
sentiment_term <- sentiment_term[!duplicated(sentiment_term$word), ]

# Re-tabulate the Washington tweets, stemming the tweet tokens as well so
# they are compared against the stemmed lexicon
table(
  unlist(
    sapply(
      nra_dc_words,
      function(tokens) {
        in_tweet <- sentiment_term$word %in% wordStem(tokens, "english")
        sentiment_term$sentiment[in_tweet]
      }
    )
  )
)

## 
## negative positive 
##      425      611

# Stemmed tabulation for the Florida tweets
table(
  unlist(
    sapply(
      nra_fl_words,
      function(tokens) {
        in_tweet <- sentiment_term$word %in% wordStem(tokens, "english")
        sentiment_term$sentiment[in_tweet]
      }
    )
  )
)

## 
## negative positive 
##      347      515

# Define functions to calculate total, positive, negative terms, "sentiment score" by location 
# Count the total number of terms in a set of texts.
#
# nra_text: character vector, one element per text (e.g. one tweet each).
# Returns:  a single integer, the number of non-empty whitespace-separated
#           tokens summed over all elements (0 for zero-length input).
Find_total_terms <- function(nra_text){
  # Split on runs of whitespace rather than on a single blank: repeated or
  # leading spaces (common once punctuation has been replaced by blanks)
  # would otherwise yield empty strings that get counted as terms.
  nra_words <- strsplit(nra_text, "[[:space:]]+")
  # vapply() guarantees an integer vector even when there are no texts;
  # empty and missing tokens are not terms.
  term_counts <- vapply(
    nra_words,
    function(tokens) sum(!is.na(tokens) & nzchar(tokens)),
    integer(1)
  )
  sum(term_counts)
}

# Count the positive sentiment terms found in a set of texts.
#
# nra_text: character vector, one element per text.
# Returns:  a length-one table entry named "positive" holding the number of
#           positive matches (0 when none are found).
# Relies on the global lexicon `sentiment_term` (columns `word`, `sentiment`)
# and on wordStem(); tokens are stemmed before being compared with the lexicon.
Find_pos_sentiments <- function(nra_text){
  nra_words <- strsplit(nra_text, ' ')
  # Per text: the polarity label of every lexicon word present in that text
  matched <- unlist(lapply(nra_words, function(z){
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Fix both categories up front so the table always has a "negative" and a
  # "positive" cell; indexing by position breaks when one polarity is absent.
  s <- table(factor(matched, levels = c("negative", "positive")))
  s["positive"]
}

# Count the negative sentiment terms found in a set of texts.
#
# nra_text: character vector, one element per text.
# Returns:  a length-one table entry named "negative" holding the number of
#           negative matches (0 when none are found).
# Relies on the global lexicon `sentiment_term` (columns `word`, `sentiment`)
# and on wordStem(); tokens are stemmed before being compared with the lexicon.
Find_neg_sentiments <- function(nra_text){
  nra_words <- strsplit(nra_text, ' ')
  # Per text: the polarity label of every lexicon word present in that text
  matched <- unlist(lapply(nra_words, function(z){
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Fix both categories up front: with positional indexing, texts containing
  # only positive terms would report the positive count as "negative".
  s <- table(factor(matched, levels = c("negative", "positive")))
  s["negative"]
}

# Compute the sentiment score of a set of texts:
#   (positive - negative) / (positive + negative)
#
# nra_text: character vector, one element per text.
# Returns:  a length-one value between -1 (only negative terms) and 1 (only
#           positive terms); NaN when no sentiment term is found at all.
# Relies on the global lexicon `sentiment_term` (columns `word`, `sentiment`)
# and on wordStem(); tokens are stemmed before being compared with the lexicon.
Find_sentimentscore <- function(nra_text){
  nra_words <- strsplit(nra_text, ' ')
  # Per text: the polarity label of every lexicon word present in that text
  matched <- unlist(lapply(nra_words, function(z){
    sentiment_term$sentiment[sentiment_term$word %in% wordStem(z, "english")]
  }))
  # Fix both categories up front and index by name: positional indexing gives
  # NA (or the wrong sign) whenever one of the two polarities is absent.
  s <- table(factor(matched, levels = c("negative", "positive")))
  score <- (s["positive"] - s["negative"]) / (s["positive"] + s["negative"])
  score
}

# Selected states
state <- c("arizona","california","florida","massachusetts","north carolina","texas","virginia","washington dc")

# Search each location in turn, clean the tweet text, tag every row with the
# location it came from, and combine everything into the data.frame all_data.
# Results are collected in a list and bound once at the end instead of growing
# all_data with rbind() on every iteration.
state_tweets <- vector("list", length(state))
for (i in seq_along(state)) {
  state_name <- state[[i]]
  st <- search_tweets(search_term, n = 1000, include_rts = FALSE, lang = "en",
                      geocode = lookup_coords(state_name, "country:US"))
  if (nrow(st) != 0) {
    # POSIX classes must sit inside a bracket expression: '[:punct:]' on its
    # own is just the character set ':', 'p', 'u', 'n', 'c', 't' and would
    # delete those letters from the text.
    st$text <- gsub("[[:punct:]]", " ", st$text)    # remove all punctuation
    # NOTE(review): the original pattern '[:graph:]' deleted the letters
    # g, r, a, p, h and ':'. Assumed intent is to blank out NON-graphical
    # characters (control characters etc.); removing graphical ones would
    # erase the whole text. Confirm.
    st$text <- gsub("[^[:graph:]]", " ", st$text)   # remove non-graphical characters
    st$text <- tolower(st$text)                     # in lower case
    st$state_name <- state_name
    state_tweets[[i]] <- st
  }
  Sys.sleep(10)   # pause between requests
}
# Locations without results stay NULL in the list and are dropped here;
# all_data is NULL when no location returned anything.
all_data <- do.call(rbind, Filter(Negate(is.null), state_tweets))

# Use aggregate() to see the number of posts collected for each location
aggregate(text ~ state_name, data = all_data, FUN = length)

##       state_name text
## 1        arizona  901
## 2     california  972
## 3        florida  999
## 4  massachusetts 1000
## 5 north carolina 1000
## 6          texas  966
## 7       virginia  988
## 8  washington dc  771

# Calculate, for each location, the total number of terms, the number of
# positive and negative terms, and the sentiment score
ad_tlt <- aggregate(text ~ state_name, data = all_data, FUN = Find_total_terms)
ad_pos <- aggregate(text ~ state_name, data = all_data, FUN = Find_pos_sentiments)
ad_neg <- aggregate(text ~ state_name, data = all_data, FUN = Find_neg_sentiments)
ad_ssc <- aggregate(text ~ state_name, data = all_data, FUN = Find_sentimentscore)

# Express the counts as shares of all terms in that location.
# The values are stored as proportions; they are not multiplied by 100.
total_terms <- ad_tlt$text
ad_ssc$sent_p <- (ad_pos$text + ad_neg$text) / total_terms
ad_ssc$possent_p <- ad_pos$text / total_terms
ad_ssc$negsent_p <- ad_neg$text / total_terms

# Plot % of sentiment terms.
# Sort the locations from highest to lowest share and store that order in the
# factor levels (presumably so the bars appear in sorted order).
ad_ssc <- ad_ssc[order(ad_ssc$sent_p, decreasing = TRUE), ]
ad_ssc$state_name <- factor(ad_ssc$state_name, levels = ad_ssc$state_name)

p <- layout(
  plot_ly(ad_ssc, x = ~state_name, y = ~sent_p, name = ~state_name, type = "bar"),
  title = "% of Sentiment Terms",
  xaxis = list(title = "State"),
  yaxis = list(title = "Percentage")
)
p

# Plot % of negative sentiment terms.
# Sort the locations from highest to lowest negative share and store that
# order in the factor levels (presumably so the bars appear in sorted order).
ad_ssc <- ad_ssc[order(ad_ssc$negsent_p, decreasing = TRUE), ]
ad_ssc$state_name <- factor(ad_ssc$state_name, levels = ad_ssc$state_name)

p <- layout(
  plot_ly(ad_ssc, x = ~state_name, y = ~negsent_p, name = ~state_name, type = "bar"),
  title = "% of Negative Sentiment Terms",
  xaxis = list(title = "State"),
  yaxis = list(title = "Percentage")
)

p

# Plot % of positive sentiment terms.
# Sort the locations from highest to lowest positive share and store that
# order in the factor levels (presumably so the bars appear in sorted order).
ad_ssc <- ad_ssc[order(ad_ssc$possent_p, decreasing = TRUE), ]
ad_ssc$state_name <- factor(ad_ssc$state_name, levels = ad_ssc$state_name)

p <- layout(
  plot_ly(ad_ssc, x = ~state_name, y = ~possent_p, name = ~state_name, type = "bar"),
  title = "% of Positive Sentiment Terms",
  xaxis = list(title = "State"),
  yaxis = list(title = "Percentage")
)

p

# Plot the sentiment score.
# The score lives in the `text` column because that is the column name
# aggregate() kept for its result. Sort the locations from highest to lowest
# score and store that order in the factor levels.
ad_ssc <- ad_ssc[order(ad_ssc$text, decreasing = TRUE), ]
ad_ssc$state_name <- factor(ad_ssc$state_name, levels = ad_ssc$state_name)

p <- layout(
  plot_ly(ad_ssc, x = ~state_name, y = ~text, name = ~state_name, type = "bar"),
  title = "Sentiment Score",
  xaxis = list(title = "State"),
  yaxis = list(title = "Score")
)

p

JMSC 6116 Lecture 6: Sentiment Analysis

King-wa Fu

February 26, 2018

1. Sentiment Analysis