library(dplyr)
library(textdata) 

# AFINN lexicon uses the `value` feature
## Number of words by value
lexicon_afinn() %>% 
  count(value)
## # A tibble: 11 x 2
##    value     n
##  * <dbl> <int>
##  1    -5    16
##  2    -4    43
##  3    -3   264
##  4    -2   966
##  5    -1   309
##  6     0     1
##  7     1   208
##  8     2   448
##  9     3   172
## 10     4    45
## 11     5     5
lexicon_afinn() %>% filter(value==-5)
## # A tibble: 16 x 2
##    word           value
##    <chr>          <dbl>
##  1 bastard           -5
##  2 bastards          -5
##  3 bitch             -5
##  4 bitches           -5
##  5 cock              -5
##  6 cocksucker        -5
##  7 cocksuckers       -5
##  8 cunt              -5
##  9 motherfucker      -5
## 10 motherfucking     -5
## 11 niggas            -5
## 12 nigger            -5
## 13 prick             -5
## 14 slut              -5
## 15 son-of-a-bitch    -5
## 16 twat              -5
# bing lexicon uses the 'sentiment' feature
## Number of words by sentiment
lexicon_bing() %>% 
  group_by(sentiment) %>% 
  summarise(n = n())
## # A tibble: 2 x 2
##   sentiment     n
## * <chr>     <int>
## 1 negative   4782
## 2 positive   2005
# count() gives the same result more concisely
lexicon_bing() %>% 
  count(sentiment)
## # A tibble: 2 x 2
##   sentiment     n
## * <chr>     <int>
## 1 negative   4782
## 2 positive   2005
# The loughran and nrc lexicons classify words into several sentiment categories
## Number of words by sentiment
lexicon_loughran() %>% 
  group_by(sentiment) %>% 
  summarise(n = n())
## # A tibble: 6 x 2
##   sentiment        n
## * <chr>        <int>
## 1 constraining   184
## 2 litigious      904
## 3 negative      2355
## 4 positive       354
## 5 superfluous     56
## 6 uncertainty    297
lexicon_nrc() %>% 
  group_by(sentiment) %>% 
  summarise(n = n())
## # A tibble: 10 x 2
##    sentiment        n
##  * <chr>        <int>
##  1 anger         1247
##  2 anticipation   839
##  3 disgust       1058
##  4 fear          1476
##  5 joy            689
##  6 negative      3324
##  7 positive      2312
##  8 sadness       1191
##  9 surprise       534
## 10 trust         1231
lexicon_nrc() %>% 
  filter(word == "hate")
## # A tibble: 5 x 2
##   word  sentiment
##   <chr> <chr>    
## 1 hate  anger    
## 2 hate  disgust  
## 3 hate  fear     
## 4 hate  negative 
## 5 hate  sadness
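# NRC is a many-to-many mapping: one word can carry several emotion labels, as
# "hate" does above. A quick sketch of which words carry the most labels
# (results depend on the installed lexicon version):
lexicon_nrc() %>% 
  count(word, sort = TRUE)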
lexicon_nrc_eil() %>% 
  count(AffectDimension)
## # A tibble: 4 x 2
##   AffectDimension     n
## * <chr>           <int>
## 1 anger            1483
## 2 fear             1765
## 3 joy              1268
## 4 sadness          1298
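
Another quick way to get a feel for these lexicons is to look up how a single word is rated in each. A minimal sketch (the word "abandon" is an arbitrary choice, and coverage depends on the lexicon versions installed):

lookup <- "abandon"
lexicon_afinn() %>% filter(word == lookup)
lexicon_bing() %>% filter(word == lookup)
lexicon_loughran() %>% filter(word == lookup)
lexicon_nrc() %>% filter(word == lookup)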

Let’s create histograms showing the distribution of word intensity scores within each affect dimension of the NRC Emotion Intensity Lexicon, and then the distribution of Dominance scores in the NRC VAD (Valence-Arousal-Dominance) lexicon.

library(ggplot2)
lexicon_nrc_eil() %>% 
  ggplot(aes(x=score)) +
  geom_histogram(color="black", fill="white") +
  facet_wrap(~ AffectDimension, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

lexicon_nrc_vad() %>% 
  ggplot(aes(x=Dominance)) +
  geom_histogram(color="black", fill="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
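
The VAD lexicon also scores Valence and Arousal. A minimal sketch comparing all three dimensions at once, assuming the columns are named Valence, Arousal, and Dominance as above (pivot_longer() is from tidyr):

library(tidyr)
lexicon_nrc_vad() %>% 
  pivot_longer(c(Valence, Arousal, Dominance), names_to = "dimension", values_to = "score") %>% 
  ggplot(aes(x = score)) +
  geom_histogram(color = "black", fill = "white", bins = 30) +
  facet_wrap(~ dimension, ncol = 3)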

Basic Lexicon-based Sentiment Analysis with inner_join()

Lexicon-based sentiment analysis can be performed on our tweet data once it is in a tidy format, that is, once each row contains a single word from a tweet.

library(tidytext)
library(stringr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stopwords)
load("covid19_tweets_df.RData")

covid19_tweets_df
## # A tibble: 1,012,305 x 6
##    user_id   status_id   created_at          screen_name text           name    
##    <chr>     <chr>       <dttm>              <chr>       <chr>          <chr>   
##  1 408707568 1243394454~ 2020-03-27 04:28:33 KathleenBu~ "Fascinating ~ Kathlee~
##  2 145492546 1243394454~ 2020-03-27 04:28:33 PetuniaV    "https://t.co~ PurpleP~
##  3 399450399 1243394453~ 2020-03-27 04:28:33 meghanttuc~ "If our Gover~ Meghan ~
##  4 86343213~ 1243394453~ 2020-03-27 04:28:33 drchristin~ "@jmj https:/~ Christi~
##  5 86343213~ 1243392682~ 2020-03-27 04:21:31 drchristin~ "happening as~ Christi~
##  6 86343213~ 1243394067~ 2020-03-27 04:27:01 drchristin~ "@stevejang f~ Christi~
##  7 81368775  1243394453~ 2020-03-27 04:28:33 Zachsnapwe~ "What's even ~ Blackwe~
##  8 12415972~ 1243394452~ 2020-03-27 04:28:33 CustomizeM~ "Myth and Fac~ Customi~
##  9 229334332 1243394452~ 2020-03-27 04:28:33 acentodiar~ "Post-COVID-1~ acento.~
## 10 956452855 1243394451~ 2020-03-27 04:28:32 AirSwerve   "she, along w~ AIR SWE~
## # ... with 1,012,295 more rows
covid19_tweets_tidy <- covid19_tweets_df %>% 
  select(created_at, text) %>% 
  filter(!duplicated(text)) %>% # drop tweets with duplicated text (e.g., retweets)
  mutate(date = floor_date(created_at, unit="day")) %>% # add a day-level date for grouping over time
  mutate(text = str_replace_all(text, "[#@]?[^[:ascii:]]+", " ")) %>% # remove non-ASCII sequences and their #/@ prefixes
  mutate(text = str_replace_all(text, "&amp;|&lt;|&gt;|&quot;|RT", " ")) %>% # remove HTML entities and the retweet marker
  unnest_tweets(word, text) %>% # tokenize with the Twitter-aware tokenizer
  filter(!word %in% stopwords()) %>% # drop stopwords
  filter(str_detect(word, "[a-z]")) # keep only tokens containing at least one letter
## Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
covid19_tweets_tidy
## # A tibble: 16,135,045 x 3
##    created_at          date                word       
##    <dttm>              <dttm>              <chr>      
##  1 2020-03-27 04:28:33 2020-03-27 00:00:00 fascinating
##  2 2020-03-27 04:28:33 2020-03-27 00:00:00 news       
##  3 2020-03-27 04:28:33 2020-03-27 00:00:00 england    
##  4 2020-03-27 04:28:33 2020-03-27 00:00:00 uk         
##  5 2020-03-27 04:28:33 2020-03-27 00:00:00 firms      
##  6 2020-03-27 04:28:33 2020-03-27 00:00:00 academics  
##  7 2020-03-27 04:28:33 2020-03-27 00:00:00 also       
##  8 2020-03-27 04:28:33 2020-03-27 00:00:00 developed  
##  9 2020-03-27 04:28:33 2020-03-27 00:00:00 selftest   
## 10 2020-03-27 04:28:33 2020-03-27 00:00:00 kits       
## # ... with 16,135,035 more rows
covid19_tweets_tidy %>% count(word, sort=T)
## # A tibble: 1,393,892 x 2
##    word              n
##    <chr>         <int>
##  1 covid19      395194
##  2 #covid19     325356
##  3 #coronavirus 208448
##  4 people        90593
##  5 s             84213
##  6 can           81292
##  7 us            80525
##  8 cases         78857
##  9 now           75707
## 10 #covid2019    67658
## # ... with 1,393,882 more rows
# Drop single-character tokens, such as the "s" left behind by contractions
covid19_tweets_tidy <- covid19_tweets_tidy %>% 
  filter(str_length(word) > 1)
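
As a sanity check on the cleaning steps, we can test the entity-replacement pattern on a made-up tweet (a hypothetical input, for illustration only):

str_replace_all("RT @user Stay safe &amp; wash your hands #COVID19",
                "&amp;|&lt;|&gt;|&quot;|RT", " ")
# the "RT" marker and "&amp;" are replaced by spaces; "@user" and "#COVID19" remain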

Understanding inner_join()

With data in a tidy format, sentiment analysis can be done as an inner join. When a tidy data frame b is joined to a tidy data frame a using a %>% inner_join(b), the result contains all rows from a that have matching values in b, together with all columns from both a and b.

library(tibble)
text <- tibble(word = c("holiday","makes","me","happy","but","this","song","is","sad"))
text
## # A tibble: 9 x 1
##   word   
##   <chr>  
## 1 holiday
## 2 makes  
## 3 me     
## 4 happy  
## 5 but    
## 6 this   
## 7 song   
## 8 is     
## 9 sad
lexicon <- tibble(word = c("happy","sad","holiday","funeral"), 
                      sentiment = c("positive","negative","positive","negative"))
lexicon
## # A tibble: 4 x 2
##   word    sentiment
##   <chr>   <chr>    
## 1 happy   positive 
## 2 sad     negative 
## 3 holiday positive 
## 4 funeral negative
inner_join(text, lexicon)
## Joining, by = "word"
## # A tibble: 3 x 2
##   word    sentiment
##   <chr>   <chr>    
## 1 holiday positive 
## 2 happy   positive 
## 3 sad     negative
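
A related verb is anti_join(), which returns the rows of a that have no match in b. It is a handy way to see which words a lexicon does not cover:

anti_join(text, lexicon)
# returns the six unmatched words: makes, me, but, this, song, is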

Let’s look at the words with positive and negative sentiment from the bing lexicon. What are the most common negative words in tweets on COVID-19? We can use count() from the dplyr package.

# Using the bing lexicon, we can keep only the words in covid19_tweets_tidy that are annotated with a sentiment
covid19_tweets_tidy
## # A tibble: 15,941,162 x 3
##    created_at          date                word       
##    <dttm>              <dttm>              <chr>      
##  1 2020-03-27 04:28:33 2020-03-27 00:00:00 fascinating
##  2 2020-03-27 04:28:33 2020-03-27 00:00:00 news       
##  3 2020-03-27 04:28:33 2020-03-27 00:00:00 england    
##  4 2020-03-27 04:28:33 2020-03-27 00:00:00 uk         
##  5 2020-03-27 04:28:33 2020-03-27 00:00:00 firms      
##  6 2020-03-27 04:28:33 2020-03-27 00:00:00 academics  
##  7 2020-03-27 04:28:33 2020-03-27 00:00:00 also       
##  8 2020-03-27 04:28:33 2020-03-27 00:00:00 developed  
##  9 2020-03-27 04:28:33 2020-03-27 00:00:00 selftest   
## 10 2020-03-27 04:28:33 2020-03-27 00:00:00 kits       
## # ... with 15,941,152 more rows
covid19_tweets_tidy %>% 
  inner_join(lexicon_bing()) 
## Joining, by = "word"
## # A tibble: 1,574,521 x 4
##    created_at          date                word        sentiment
##    <dttm>              <dttm>              <chr>       <chr>    
##  1 2020-03-27 04:28:33 2020-03-27 00:00:00 fascinating positive 
##  2 2020-03-27 04:28:33 2020-03-27 00:00:00 available   positive 
##  3 2020-03-27 04:28:33 2020-03-27 00:00:00 virus       negative 
##  4 2020-03-27 04:28:33 2020-03-27 00:00:00 hard        negative 
##  5 2020-03-27 04:28:33 2020-03-27 00:00:00 fucking     negative 
##  6 2020-03-27 04:28:33 2020-03-27 00:00:00 like        positive 
##  7 2020-03-27 04:28:33 2020-03-27 00:00:00 shit        negative 
##  8 2020-03-27 04:27:01 2020-03-27 00:00:00 support     positive 
##  9 2020-03-27 04:27:01 2020-03-27 00:00:00 like        positive 
## 10 2020-03-27 04:28:33 2020-03-27 00:00:00 myth        negative 
## # ... with 1,574,511 more rows
# We can count the usage frequency of positive and negative words in tweets on COVID-19
covid19_tweets_tidy %>% 
  inner_join(lexicon_bing()) %>% 
  count(sentiment, sort=T)
## Joining, by = "word"
## # A tibble: 2 x 2
##   sentiment      n
##   <chr>      <int>
## 1 negative  807340
## 2 positive  767181
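# Since each row still carries a date, we can also track net sentiment over time.
# A minimal sketch using tidyr::pivot_wider (counts will vary with the data):
library(tidyr)
covid19_tweets_tidy %>% 
  inner_join(lexicon_bing()) %>% 
  count(date, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(net = positive - negative)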
# Or we can count the frequency of 'fear' words in tweets on COVID-19
covid19_tweets_tidy %>% 
  inner_join(lexicon_nrc()) %>% 
  filter(sentiment == "fear") %>% 
  count(word, sort=T)
## Joining, by = "word"
## # A tibble: 1,427 x 2
##    word           n
##    <chr>      <int>
##  1 pandemic   54287
##  2 fight      22013
##  3 government 21564
##  4 death      18399
##  5 medical    17978
##  6 hospital   16391
##  7 case       12735
##  8 emergency  12519
##  9 risk       11825
## 10 watch      11405
## # ... with 1,417 more rows
# We can also summarise the frequency of each emotion
covid19_tweets_tidy %>% 
  inner_join(lexicon_nrc()) %>% 
  group_by(sentiment) %>% 
  summarise(freq = n()) %>% 
  arrange(desc(freq))
## Joining, by = "word"
## # A tibble: 10 x 2
##    sentiment       freq
##    <chr>          <int>
##  1 positive     1214572
##  2 negative      899418
##  3 trust         799147
##  4 fear          619911
##  5 anticipation  569580
##  6 sadness       448993
##  7 joy           383489
##  8 anger         333921
##  9 surprise      261166
## 10 disgust       229475
# count(sentiment, sort=T) is shorthand for the group_by() + summarise() + arrange() pipeline above
covid19_tweets_tidy %>% 
  inner_join(lexicon_nrc()) %>% 
  count(sentiment, sort=T)
## Joining, by = "word"
## # A tibble: 10 x 2
##    sentiment          n
##    <chr>          <int>
##  1 positive     1214572
##  2 negative      899418
##  3 trust         799147
##  4 fear          619911
##  5 anticipation  569580
##  6 sadness       448993
##  7 joy           383489
##  8 anger         333921
##  9 surprise      261166
## 10 disgust       229475
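
Raw frequencies can be hard to compare across emotions, so it may help to express them as shares of all lexicon-matched words. A minimal sketch:

covid19_tweets_tidy %>% 
  inner_join(lexicon_nrc()) %>% 
  count(sentiment) %>% 
  mutate(share = n / sum(n))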

Visualizing the results of sentiment analysis

library(ggplot2)

# Bar chart
covid19_tweets_tidy %>% 
  inner_join(lexicon_nrc()) %>% 
  count(sentiment, sort=TRUE) %>%
  mutate(sentiment = reorder(sentiment, n)) %>% 
  ggplot(aes(x=sentiment, y=n)) +
  labs(x="Emotion", y="Frequency", title="Bar Chart of Sentiment toward COVID-19") +
  geom_bar(stat="identity", width=.5, fill="tomato3")  
## Joining, by = "word"

# Pie chart
covid19_tweets_tidy %>% 
  inner_join(lexicon_bing()) %>% 
  count(sentiment, sort=TRUE) %>%
  mutate(sentiment = reorder(sentiment, n)) %>% 
  ggplot(aes(x="", y=n, fill=factor(sentiment))) +
  geom_bar(width=1, stat="identity") +
  labs(fill="sentiment", x=NULL, y=NULL, title="Pie Chart of Sentiment toward COVID-19") +
  coord_polar(theta="y", start=0) +
  theme_void()
## Joining, by = "word"

covid19_tweets_tidy %>% 
  inner_join(lexicon_bing()) %>% 
  count(sentiment, sort=TRUE) %>% 
  mutate(sentiment = factor(sentiment, levels=c("negative","positive"))) %>% 
  ggplot(aes(x="", y=n, fill=sentiment)) +
  geom_bar(width=1, stat="identity") +
  labs(fill="sentiment", x=NULL, y=NULL, title="Pie Chart of Sentiment toward COVID-19") +
  scale_fill_discrete(name="Sentiment", labels=c("negative: 807340","positive: 767181")) + # counts from the bing sentiment frequencies above
  coord_polar(theta="y", start=0) +
  theme_void()
## Joining, by = "word"
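
Hardcoding label counts is fragile; here is a sketch that builds the legend labels from the data itself (the counts are computed once and reused):

bing_counts <- covid19_tweets_tidy %>% 
  inner_join(lexicon_bing()) %>% 
  count(sentiment)
bing_counts %>% 
  ggplot(aes(x="", y=n, fill=sentiment)) +
  geom_bar(width=1, stat="identity") +
  labs(x=NULL, y=NULL, title="Pie Chart of Sentiment toward COVID-19") +
  coord_polar(theta="y", start=0) +
  scale_fill_discrete(name="Sentiment", labels=paste0(bing_counts$sentiment, ": ", bing_counts$n)) +
  theme_void()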

We can also visualize the top 20 words for each sentiment in the bing or nrc lexicons:

covid19_tweets_tidy %>% 
   inner_join(lexicon_nrc()) %>% 
   group_by(sentiment) %>% 
   count(word, sort=T) %>% 
   top_n(20) %>% 
   ggplot(aes(reorder(word, n), n, fill=sentiment)) +
   geom_bar(stat="identity", show.legend = FALSE) +
   facet_wrap(~sentiment, scales="free_y", ncol=5) +
   labs(y = "Contribution to sentiment", x = NULL) +
   coord_flip()
## Joining, by = "word"
## Selecting by n

covid19_tweets_tidy %>% 
   inner_join(lexicon_bing()) %>% 
   group_by(sentiment) %>% 
   count(word, sort=T) %>% 
   top_n(20) %>% 
   ggplot(aes(reorder(word, n), n, fill=sentiment)) +
   geom_bar(stat="identity", show.legend = FALSE) +
   facet_wrap(~sentiment, scales="free_y") +
   labs(y = "Contribution to sentiment", x = NULL) +
   coord_flip()
## Joining, by = "word"
## Selecting by n
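
One caveat: reorder(word, n) orders the bars globally, so a word shared across facets may appear out of order within its panel. tidytext provides reorder_within() and scale_x_reordered() for per-facet ordering; a minimal sketch with the bing lexicon:

covid19_tweets_tidy %>% 
   inner_join(lexicon_bing()) %>% 
   count(sentiment, word, sort=TRUE) %>% 
   group_by(sentiment) %>% 
   top_n(20) %>% 
   ungroup() %>% 
   ggplot(aes(reorder_within(word, n, sentiment), n, fill=sentiment)) +
   geom_bar(stat="identity", show.legend = FALSE) +
   scale_x_reordered() +
   facet_wrap(~sentiment, scales="free_y") +
   labs(y = "Contribution to sentiment", x = NULL) +
   coord_flip()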

Visualization of sentiment word clouds

library(wordcloud)
## Loading required package: RColorBrewer
# Positive words
covid19_tweets_tidy %>% 
  inner_join(lexicon_bing()) %>% # Joining with the Bing dataset
  filter(!word %in% c("trump", "like","positive","virus")) %>% # remove words that are irrelevant to sentiment in this context
  group_by(sentiment) %>% 
  count(word, sort=T) %>% 
  filter(sentiment=="positive") %>% 
  with(wordcloud(words = word, # with() applies an expression to a dataset
                 freq = n, 
                 max.words = 100, # maximum number of words plotted
                 random.order = FALSE, # most frequent words placed in the middle
                 rot.per = 0.2, # proportion of words rotated 90 degrees
                 scale = c(3, 0.3), # range of word sizes
                 colors = brewer.pal(8, "Dark2"))) # use 8 colors from the "Dark2" palette
## Joining, by = "word"

covid19_tweets_tidy %>% 
  inner_join(lexicon_bing()) %>% 
  filter(!word %in% c("trump", "like","positive","virus")) %>% 
  group_by(sentiment) %>% 
  count(word, sort=T) %>% 
  filter(sentiment=="negative") %>% 
  with(wordcloud(words = word, # with() applies an expression to a dataset
                 freq = n, 
                 max.words = 100, # maximum number of words plotted
                 random.order = FALSE, # most frequent words placed in the middle
                 rot.per = 0.2, # proportion of words rotated 90 degrees
                 scale = c(3, 0.3), # range of word sizes
                 colors = brewer.pal(8, "Dark2"))) # use 8 colors from the "Dark2" palette
## Joining, by = "word"
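
Positive and negative words can also be drawn in a single plot with comparison.cloud() from the wordcloud package. A minimal sketch, assuming reshape2 is available to build the word-by-sentiment matrix:

library(reshape2)
covid19_tweets_tidy %>% 
  inner_join(lexicon_bing()) %>% 
  filter(!word %in% c("trump", "like","positive","virus")) %>% 
  count(word, sentiment, sort=TRUE) %>% 
  acast(word ~ sentiment, value.var="n", fill=0) %>% # rows = words, columns = negative/positive
  comparison.cloud(colors=c("tomato3","darkgreen"), # negative, positive (alphabetical column order)
                   max.words=100)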