library(dplyr)
library(textdata)
# The AFINN lexicon uses the `value` feature
## Number of words by value
lexicon_afinn() %>%
  count(value)
## # A tibble: 11 x 2
## value n
## * <dbl> <int>
## 1 -5 16
## 2 -4 43
## 3 -3 264
## 4 -2 966
## 5 -1 309
## 6 0 1
## 7 1 208
## 8 2 448
## 9 3 172
## 10 4 45
## 11 5 5
lexicon_afinn() %>% filter(value==-5)
## # A tibble: 16 x 2
## word value
## <chr> <dbl>
## 1 bastard -5
## 2 bastards -5
## 3 bitch -5
## 4 bitches -5
## 5 cock -5
## 6 cocksucker -5
## 7 cocksuckers -5
## 8 cunt -5
## 9 motherfucker -5
## 10 motherfucking -5
## 11 niggas -5
## 12 nigger -5
## 13 prick -5
## 14 slut -5
## 15 son-of-a-bitch -5
## 16 twat -5
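Because AFINN assigns each word an integer value from -5 to 5, a simple document score can be computed by joining tokens to the lexicon and summing their values. Below is a minimal sketch on a toy set of tokens (the words and the net_score name are just for illustration):
library(tibble)
# A toy "document", already tokenized into one word per row
doc <- tibble(word = c("good", "bad", "terrible", "fun"))
# Keep only the words found in AFINN and sum their values into a net score
doc %>%
  inner_join(lexicon_afinn(), by = "word") %>%
  summarise(net_score = sum(value))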
# The Bing lexicon uses the 'sentiment' feature
## Number of words by sentiment
lexicon_bing() %>%
  group_by(sentiment) %>%
  summarise(n = n())
## # A tibble: 2 x 2
## sentiment n
## * <chr> <int>
## 1 negative 4782
## 2 positive 2005
# count(sentiment) is a shortcut for group_by(sentiment) %>% summarise(n = n())
lexicon_bing() %>%
  count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## * <chr> <int>
## 1 negative 4782
## 2 positive 2005
# The Loughran and NRC lexicons assign words to multiple sentiment categories
## Number of words by sentiment
lexicon_loughran() %>%
  group_by(sentiment) %>%
  summarise(n = n())
## # A tibble: 6 x 2
## sentiment n
## * <chr> <int>
## 1 constraining 184
## 2 litigious 904
## 3 negative 2355
## 4 positive 354
## 5 superfluous 56
## 6 uncertainty 297
lexicon_nrc() %>%
  group_by(sentiment) %>%
  summarise(n = n())
## # A tibble: 10 x 2
## sentiment n
## * <chr> <int>
## 1 anger 1247
## 2 anticipation 839
## 3 disgust 1058
## 4 fear 1476
## 5 joy 689
## 6 negative 3324
## 7 positive 2312
## 8 sadness 1191
## 9 surprise 534
## 10 trust 1231
lexicon_nrc() %>%
  filter(word == "hate")
## # A tibble: 5 x 2
## word sentiment
## <chr> <chr>
## 1 hate anger
## 2 hate disgust
## 3 hate fear
## 4 hate negative
## 5 hate sadness
lexicon_nrc_eil() %>%
  count(AffectDimension)
## # A tibble: 4 x 2
## AffectDimension n
## * <chr> <int>
## 1 anger 1483
## 2 fear 1765
## 3 joy 1268
## 4 sadness 1298
Let’s create a histogram of the intensity scores in each affect dimension, showing how many words fall at each level of intensity.
library(ggplot2)
lexicon_nrc_eil() %>%
  ggplot(aes(x=score)) +
  geom_histogram(color="black", fill="white") +
  facet_wrap(~ AffectDimension, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
lexicon_nrc_vad() %>%
  ggplot(aes(x=Dominance)) +
  geom_histogram(color="black", fill="white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
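The NRC VAD lexicon also provides Valence and Arousal scores alongside Dominance. As a sketch (assuming the column names Valence, Arousal, and Dominance, and using tidyr, which is not loaded elsewhere in this section), we can reshape the lexicon into long format and facet the histogram over all three dimensions at once:
library(tidyr)
lexicon_nrc_vad() %>%
  pivot_longer(c(Valence, Arousal, Dominance), names_to="dimension", values_to="score") %>%
  ggplot(aes(x=score)) +
  geom_histogram(color="black", fill="white", bins=30) +
  facet_wrap(~ dimension, ncol=3)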
inner_join()
Lexicon-based sentiment analysis can be performed using our tweet data in a tidy format, that is, a format in which each row contains a single word from a tweet.
library(tidytext)
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stopwords)
load("covid19_tweets_df.RData")
covid19_tweets_df
## # A tibble: 1,012,305 x 6
## user_id status_id created_at screen_name text name
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 408707568 1243394454~ 2020-03-27 04:28:33 KathleenBu~ "Fascinating ~ Kathlee~
## 2 145492546 1243394454~ 2020-03-27 04:28:33 PetuniaV "https://t.co~ PurpleP~
## 3 399450399 1243394453~ 2020-03-27 04:28:33 meghanttuc~ "If our Gover~ Meghan ~
## 4 86343213~ 1243394453~ 2020-03-27 04:28:33 drchristin~ "@jmj https:/~ Christi~
## 5 86343213~ 1243392682~ 2020-03-27 04:21:31 drchristin~ "happening as~ Christi~
## 6 86343213~ 1243394067~ 2020-03-27 04:27:01 drchristin~ "@stevejang f~ Christi~
## 7 81368775 1243394453~ 2020-03-27 04:28:33 Zachsnapwe~ "What's even ~ Blackwe~
## 8 12415972~ 1243394452~ 2020-03-27 04:28:33 CustomizeM~ "Myth and Fac~ Customi~
## 9 229334332 1243394452~ 2020-03-27 04:28:33 acentodiar~ "Post-COVID-1~ acento.~
## 10 956452855 1243394451~ 2020-03-27 04:28:32 AirSwerve "she, along w~ AIR SWE~
## # ... with 1,012,295 more rows
covid19_tweets_tidy <- covid19_tweets_df %>%
  select(created_at, text) %>%
  filter(!duplicated(text)) %>% # Remove duplicated tweets
  mutate(date = floor_date(created_at, unit="day")) %>%
  mutate(text = str_replace_all(text, "[#@]?[^[:ascii:]]+", " ")) %>% # Remove non-ASCII tokens
  mutate(text = str_replace_all(text, "&amp;|&lt;|&gt;|&quot;|RT", " ")) %>% # Remove HTML entities and retweet markers
  unnest_tweets(word, text) %>%
  filter(!word %in% stopwords()) %>%
  filter(str_detect(word, "[a-z]")) # Keep tokens containing at least one letter
## Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
covid19_tweets_tidy
## # A tibble: 16,135,045 x 3
## created_at date word
## <dttm> <dttm> <chr>
## 1 2020-03-27 04:28:33 2020-03-27 00:00:00 fascinating
## 2 2020-03-27 04:28:33 2020-03-27 00:00:00 news
## 3 2020-03-27 04:28:33 2020-03-27 00:00:00 england
## 4 2020-03-27 04:28:33 2020-03-27 00:00:00 uk
## 5 2020-03-27 04:28:33 2020-03-27 00:00:00 firms
## 6 2020-03-27 04:28:33 2020-03-27 00:00:00 academics
## 7 2020-03-27 04:28:33 2020-03-27 00:00:00 also
## 8 2020-03-27 04:28:33 2020-03-27 00:00:00 developed
## 9 2020-03-27 04:28:33 2020-03-27 00:00:00 selftest
## 10 2020-03-27 04:28:33 2020-03-27 00:00:00 kits
## # ... with 16,135,035 more rows
covid19_tweets_tidy %>% count(word, sort=T)
## # A tibble: 1,393,892 x 2
## word n
## <chr> <int>
## 1 covid19 395194
## 2 #covid19 325356
## 3 #coronavirus 208448
## 4 people 90593
## 5 s 84213
## 6 can 81292
## 7 us 80525
## 8 cases 78857
## 9 now 75707
## 10 #covid2019 67658
## # ... with 1,393,882 more rows
# Drop single-character tokens (e.g., the stray "s" above)
covid19_tweets_tidy <- covid19_tweets_tidy %>%
  filter(str_length(word) > 1)
With data in a tidy format, sentiment analysis can be done as an inner join. When a tidy data frame b is joined to a tidy data frame a using a %>% inner_join(b), the result contains all rows from a that have matching values in b, together with the columns from both a and b.
library(tibble)
text <- tibble(word = c("holiday","makes","me","happy","but","this","song","is","sad"))
text
## # A tibble: 9 x 1
## word
## <chr>
## 1 holiday
## 2 makes
## 3 me
## 4 happy
## 5 but
## 6 this
## 7 song
## 8 is
## 9 sad
lexicon <- tibble(word = c("happy","sad","holiday","funeral"),
                  sentiment = c("positive","negative","positive","negative"))
lexicon
## # A tibble: 4 x 2
## word sentiment
## <chr> <chr>
## 1 happy positive
## 2 sad negative
## 3 holiday positive
## 4 funeral negative
inner_join(text, lexicon)
## Joining, by = "word"
## # A tibble: 3 x 2
## word sentiment
## <chr> <chr>
## 1 holiday positive
## 2 happy positive
## 3 sad negative
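Note that the words without a match in lexicon ("makes", "me", "but", "this", "song", "is") were silently dropped by inner_join(). dplyr's anti_join() returns exactly those unmatched rows, which is a handy check on what a lexicon misses:
# Rows of text whose word has no match in lexicon
anti_join(text, lexicon, by = "word")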
Let’s look at the words with positive and negative sentiment from the Bing lexicon. What are the most common negative words in tweets on COVID-19? We can use count() from the dplyr package.
# Using the Bing lexicon, we can keep only the words in covid19_tweets_tidy that are annotated with a sentiment
covid19_tweets_tidy
## # A tibble: 15,941,162 x 3
## created_at date word
## <dttm> <dttm> <chr>
## 1 2020-03-27 04:28:33 2020-03-27 00:00:00 fascinating
## 2 2020-03-27 04:28:33 2020-03-27 00:00:00 news
## 3 2020-03-27 04:28:33 2020-03-27 00:00:00 england
## 4 2020-03-27 04:28:33 2020-03-27 00:00:00 uk
## 5 2020-03-27 04:28:33 2020-03-27 00:00:00 firms
## 6 2020-03-27 04:28:33 2020-03-27 00:00:00 academics
## 7 2020-03-27 04:28:33 2020-03-27 00:00:00 also
## 8 2020-03-27 04:28:33 2020-03-27 00:00:00 developed
## 9 2020-03-27 04:28:33 2020-03-27 00:00:00 selftest
## 10 2020-03-27 04:28:33 2020-03-27 00:00:00 kits
## # ... with 15,941,152 more rows
covid19_tweets_tidy %>%
  inner_join(lexicon_bing())
## Joining, by = "word"
## # A tibble: 1,574,521 x 4
## created_at date word sentiment
## <dttm> <dttm> <chr> <chr>
## 1 2020-03-27 04:28:33 2020-03-27 00:00:00 fascinating positive
## 2 2020-03-27 04:28:33 2020-03-27 00:00:00 available positive
## 3 2020-03-27 04:28:33 2020-03-27 00:00:00 virus negative
## 4 2020-03-27 04:28:33 2020-03-27 00:00:00 hard negative
## 5 2020-03-27 04:28:33 2020-03-27 00:00:00 fucking negative
## 6 2020-03-27 04:28:33 2020-03-27 00:00:00 like positive
## 7 2020-03-27 04:28:33 2020-03-27 00:00:00 shit negative
## 8 2020-03-27 04:27:01 2020-03-27 00:00:00 support positive
## 9 2020-03-27 04:27:01 2020-03-27 00:00:00 like positive
## 10 2020-03-27 04:28:33 2020-03-27 00:00:00 myth negative
## # ... with 1,574,511 more rows
# We can count the usage frequency of positive and negative words in tweets on COVID-19
covid19_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(sentiment, sort=T)
## Joining, by = "word"
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 807340
## 2 positive 767181
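Because we kept a date column when tidying the tweets, the same join also supports sentiment over time. Here is a sketch (using tidyr::pivot_wider(), an addition not in the original code) that plots the daily difference between positive and negative word counts:
library(tidyr)
covid19_tweets_tidy %>%
  inner_join(lexicon_bing(), by = "word") %>%
  count(date, sentiment) %>%
  pivot_wider(names_from=sentiment, values_from=n, values_fill=0) %>%
  mutate(net = positive - negative) %>% # Net sentiment per day
  ggplot(aes(x=date, y=net)) +
  geom_col() +
  labs(x="Day", y="Positive minus negative words")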
# Or we can count the frequency of 'fear' words in tweets on COVID-19
covid19_tweets_tidy %>%
  inner_join(lexicon_nrc()) %>%
  filter(sentiment == "fear") %>%
  count(word, sort=T)
## Joining, by = "word"
## # A tibble: 1,427 x 2
## word n
## <chr> <int>
## 1 pandemic 54287
## 2 fight 22013
## 3 government 21564
## 4 death 18399
## 5 medical 17978
## 6 hospital 16391
## 7 case 12735
## 8 emergency 12519
## 9 risk 11825
## 10 watch 11405
## # ... with 1,417 more rows
# We can also summarise different emotions
covid19_tweets_tidy %>%
  inner_join(lexicon_nrc()) %>%
  group_by(sentiment) %>%
  summarise(freq = n()) %>%
  arrange(desc(freq))
## Joining, by = "word"
## # A tibble: 10 x 2
## sentiment freq
## <chr> <int>
## 1 positive 1214572
## 2 negative 899418
## 3 trust 799147
## 4 fear 619911
## 5 anticipation 569580
## 6 sadness 448993
## 7 joy 383489
## 8 anger 333921
## 9 surprise 261166
## 10 disgust 229475
# Again, count() with sort=T is a shortcut for group_by() %>% summarise() %>% arrange()
covid19_tweets_tidy %>%
  inner_join(lexicon_nrc()) %>%
  count(sentiment, sort=T)
## Joining, by = "word"
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 positive 1214572
## 2 negative 899418
## 3 trust 799147
## 4 fear 619911
## 5 anticipation 569580
## 6 sadness 448993
## 7 joy 383489
## 8 anger 333921
## 9 surprise 261166
## 10 disgust 229475
library(ggplot2)
# Bar chart
covid19_tweets_tidy %>%
  inner_join(lexicon_nrc()) %>%
  count(sentiment, sort=TRUE) %>%
  mutate(sentiment = reorder(sentiment, n)) %>%
  ggplot(aes(x=sentiment, y=n)) +
  labs(x="Emotion", y="Frequency", title="Bar Chart of Sentiment toward COVID-19") +
  geom_bar(stat="identity", width=.5, fill="tomato3")
## Joining, by = "word"
# Pie chart
covid19_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(sentiment, sort=TRUE) %>%
  mutate(sentiment = reorder(sentiment, n)) %>%
  ggplot(aes(x="", y=n, fill=factor(sentiment))) +
  geom_bar(width=1, stat="identity") +
  labs(fill="sentiment", x=NULL, y=NULL, title="Pie Chart of Sentiment toward COVID-19") +
  coord_polar(theta="y", start=0) +
  theme_void()
## Joining, by = "word"
covid19_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(sentiment, sort=TRUE) %>%
  mutate(sentiment = factor(sentiment, levels=c("negative","positive"))) %>%
  ggplot(aes(x="", y=n, fill=sentiment)) +
  geom_bar(width=1, stat="identity") +
  labs(fill="sentiment", x=NULL, y=NULL, title="Pie Chart of Sentiment toward COVID-19") +
  scale_fill_discrete(name="Sentiment", labels=c("negative: 807340","positive: 767181")) + # Counts from the Bing join above
  coord_polar(theta="y", start=0) +
  theme_void()
## Joining, by = "word"
covid19_tweets_tidy %>%
  inner_join(lexicon_nrc()) %>%
  group_by(sentiment) %>%
  count(word, sort=T) %>%
  top_n(20) %>%
  ggplot(aes(reorder(word, n), n, fill=sentiment)) +
  geom_bar(stat="identity", show.legend = FALSE) +
  facet_wrap(~sentiment, scales="free_y", ncol=5) +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()
## Joining, by = "word"
## Selecting by n
covid19_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  group_by(sentiment) %>%
  count(word, sort=T) %>%
  top_n(20) %>%
  ggplot(aes(reorder(word, n), n, fill=sentiment)) +
  geom_bar(stat="identity", show.legend = FALSE) +
  facet_wrap(~sentiment, scales="free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()
## Joining, by = "word"
## Selecting by n
library(wordcloud)
## Loading required package: RColorBrewer
# Positive words
covid19_tweets_tidy %>%
  inner_join(lexicon_bing()) %>% # Join with the Bing lexicon
  filter(!word %in% c("trump", "like","positive","virus")) %>% # Remove words whose sentiment is misleading in this context
  group_by(sentiment) %>%
  count(word, sort=T) %>%
  filter(sentiment=="positive") %>%
  with(wordcloud(words = word, # with() evaluates the expression using this data frame
                 freq = n,
                 max.words = 100, # Maximum number of words plotted
                 random.order = FALSE, # Highly frequent words are placed in the middle
                 rot.per = 0.2, # Proportion of words rotated in the plot
                 scale = c(3, 0.3), # Range of word sizes
                 colors = brewer.pal(8, "Dark2"))) # Use 8 colors from the "Dark2" palette
## Joining, by = "word"
# Negative words
covid19_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  filter(!word %in% c("trump", "like","positive","virus")) %>%
  group_by(sentiment) %>%
  count(word, sort=T) %>%
  filter(sentiment=="negative") %>%
  with(wordcloud(words = word,
                 freq = n,
                 max.words = 100,
                 random.order = FALSE,
                 rot.per = 0.2,
                 scale = c(3, 0.3),
                 colors = brewer.pal(8, "Dark2")))
## Joining, by = "word"
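A comparison cloud shows the positive and negative words in a single plot. The sketch below assumes the reshape2 package (not used elsewhere in this section) to build the word-by-sentiment count matrix that wordcloud's comparison.cloud() expects:
library(reshape2)
covid19_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  filter(!word %in% c("trump", "like","positive","virus")) %>%
  count(word, sentiment, sort=TRUE) %>%
  acast(word ~ sentiment, value.var="n", fill=0) %>% # Rows are words, columns are sentiments
  comparison.cloud(colors=c("tomato3", "darkgreen"), # One color per sentiment column
                   max.words=100)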