We know how to convert our tweet data into a tidy text data format.
library(tidyverse)
## -- Attaching packages ------------------------------------ tidyverse 1.3.0 --
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.4
## ✓ tibble  3.0.0     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stringr)
library(stopwords)
load("covid_tweets_423.RData")
covid_tweets # This dataset contains 18,224 tweets about COVID-19, including geo-location information.
## # A tibble: 18,224 x 9
## user_id status_id created_at screen_name text lang country lat
## <chr> <chr> <dttm> <chr> <chr> <chr> <chr> <dbl>
## 1 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ "@ev~ en United~ 36.0
## 2 169480~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ "Ple~ en United~ 36.9
## 3 215583~ 12533658~ 2020-04-23 16:51:09 KOROGLU_BA~ "@Ay~ tr Azerba~ 40.2
## 4 744597~ 12533657~ 2020-04-23 16:51:05 FoodFocusSA "Pre~ en South ~ -26.1
## 5 155877~ 12533657~ 2020-04-23 16:51:01 opcionsecu~ "#AT~ es Ecuador -1.67
## 6 998960~ 12533657~ 2020-04-23 16:51:01 amystones4 "Tha~ en United~ 53.7
## 7 102768~ 12533657~ 2020-04-23 16:51:00 COTACYT "Men~ es Mexico 23.7
## 8 247382~ 12533657~ 2020-04-23 16:50:54 bkracing123 "The~ en United~ 53.9
## 9 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm "Thi~ en United~ 37.5
## 10 226707~ 12533656~ 2020-04-23 16:50:42 JLeonRojas "INF~ es Chile -35.5
## # ... with 18,214 more rows, and 1 more variable: lng <dbl>
covid_tweets_tidy <- covid_tweets %>%
filter(lang == "en") %>% # Keeping only tweets written in English
mutate(hour = floor_date(created_at, unit="hour")) %>% # Creating a variable that bins tweets into hour-long units of time
mutate(text = str_replace_all(text, "[#@]?[^[:ascii:]]+", " ")) %>% # Removing runs of non-ASCII characters, along with a # or @ immediately preceding them
mutate(text = str_replace_all(text, "&amp;|&lt;|&gt;|&quot;|RT", " ")) %>% # Removing HTML entities and the retweet marker
unnest_tweets(word, text) %>% # Splitting text into words with unnest_tweets()
filter(!word %in% stopwords()) %>% # Removing words that match any element of the stopwords() vector
filter(str_detect(word, "[a-z]")) # Keeping only words that contain at least one alphabetical letter
## Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
covid_tweets_tidy
## # A tibble: 167,641 x 10
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 4 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 5 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 6 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 7 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 8 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 9 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 10 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## # ... with 167,631 more rows, and 2 more variables: hour <dttm>, word <chr>
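As an aside, unnest_tweets() is tidytext's wrapper around the "tweets" tokenizer, which keeps hashtags and @mentions intact as single tokens; the message above is tidytext warning that lowercasing may mangle URLs. The same step can be spelled out with unnest_tokens(), sketched below on the same pipeline:
# A sketch of the equivalent call with unnest_tokens(); token = "tweets"
# selects the tweet-aware tokenizer that preserves hashtags and @mentions.
covid_tweets %>%
  filter(lang == "en") %>%
  unnest_tokens(word, text, token = "tweets")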
Once the text is in tidy format, we are ready to do sentiment analysis using inner_join().
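To see what the join does, here is a minimal sketch with toy data (the words, IDs, and sentiments below are made up for illustration):
# Each word is matched against the lexicon; unmatched words are dropped.
toy_words <- tibble(status_id = c("1", "1", "2"),
                    word = c("good", "virus", "sad"))
toy_lexicon <- tibble(word = c("good", "sad"),
                      sentiment = c("positive", "negative"))
inner_join(toy_words, toy_lexicon, by = "word") # keeps only words found in the lexicon
## # A tibble: 2 x 3
##   status_id word  sentiment
##   <chr>     <chr> <chr>
## 1 1         good  positive
## 2 2         sad   negative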
library(textdata)
covid_tweets_tidy %>%
inner_join(lexicon_bing()) # Joining with the Bing lexicon
## Joining, by = "word"
## # A tibble: 15,936 x 11
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 169480~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## 4 998960~ 12533657~ 2020-04-23 16:51:01 amystones4 en United~ 53.7 -1.65
## 5 998960~ 12533657~ 2020-04-23 16:51:01 amystones4 en United~ 53.7 -1.65
## 6 247382~ 12533657~ 2020-04-23 16:50:54 bkracing123 en United~ 53.9 -1.21
## 7 247382~ 12533657~ 2020-04-23 16:50:54 bkracing123 en United~ 53.9 -1.21
## 8 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## 9 314986~ 12533656~ 2020-04-23 16:50:39 dande_hema~ en India 15.9 80.8
## 10 314986~ 12533656~ 2020-04-23 16:50:39 dande_hema~ en India 15.9 80.8
## # ... with 15,926 more rows, and 3 more variables: hour <dttm>, word <chr>,
## # sentiment <chr>
covid_tweets_tidy %>%
inner_join(lexicon_nrc())
## Joining, by = "word"
## # A tibble: 57,217 x 11
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 4 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 5 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 6 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 7 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 8 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 9 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 10 169480~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## # ... with 57,207 more rows, and 3 more variables: hour <dttm>, word <chr>,
## # sentiment <chr>
covid_tweets_tidy %>%
inner_join(lexicon_nrc_eil(), by=c("word"="term")) # Joining with the NRC-EIL. The join variable is named differently in the two data sets, so by = c("word" = "term") matches "word" in covid_tweets_tidy to "term" in lexicon_nrc_eil().
## # A tibble: 23,895 x 12
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 4 169480~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## 5 169480~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## 6 247382~ 12533657~ 2020-04-23 16:50:54 bkracing123 en United~ 53.9 -1.21
## 7 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## 8 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## 9 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## 10 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## # ... with 23,885 more rows, and 4 more variables: hour <dttm>, word <chr>,
## # score <dbl>, AffectDimension <chr>
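A toy illustration of joining on differently named columns (made-up words and scores):
x <- tibble(word = c("crisis", "hope"))
y <- tibble(term = c("crisis", "fear"), score = c(0.70, 0.83))
inner_join(x, y, by = c("word" = "term")) # "word" in x is matched to "term" in y
## # A tibble: 1 x 2
##   word   score
##   <chr>  <dbl>
## 1 crisis   0.7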
After joining the tidy text data set with a sentiment lexicon, we can count the sentiment variable by the time variable.
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
count(hour, sentiment) # Counting sentiments by hour
## Joining, by = "word"
## # A tibble: 40 x 3
## hour sentiment n
## <dttm> <chr> <int>
## 1 2020-04-22 21:00:00 negative 516
## 2 2020-04-22 21:00:00 positive 525
## 3 2020-04-22 22:00:00 negative 524
## 4 2020-04-22 22:00:00 positive 507
## 5 2020-04-22 23:00:00 negative 349
## 6 2020-04-22 23:00:00 positive 418
## 7 2020-04-23 00:00:00 negative 356
## 8 2020-04-23 00:00:00 positive 405
## 9 2020-04-23 01:00:00 negative 370
## 10 2020-04-23 01:00:00 positive 397
## # ... with 30 more rows
covid_tweets_tidy %>%
inner_join(lexicon_nrc()) %>%
count(hour, sentiment)
## Joining, by = "word"
## # A tibble: 200 x 3
## hour sentiment n
## <dttm> <chr> <int>
## 1 2020-04-22 21:00:00 anger 223
## 2 2020-04-22 21:00:00 anticipation 379
## 3 2020-04-22 21:00:00 disgust 179
## 4 2020-04-22 21:00:00 fear 384
## 5 2020-04-22 21:00:00 joy 280
## 6 2020-04-22 21:00:00 negative 577
## 7 2020-04-22 21:00:00 positive 720
## 8 2020-04-22 21:00:00 sadness 289
## 9 2020-04-22 21:00:00 surprise 201
## 10 2020-04-22 21:00:00 trust 462
## # ... with 190 more rows
covid_tweets_tidy %>%
inner_join(lexicon_nrc_eil(), by=c("word"="term")) %>%
count(hour, AffectDimension)
## # A tibble: 80 x 3
## hour AffectDimension n
## <dttm> <chr> <int>
## 1 2020-04-22 21:00:00 anger 237
## 2 2020-04-22 21:00:00 fear 467
## 3 2020-04-22 21:00:00 joy 536
## 4 2020-04-22 21:00:00 sadness 338
## 5 2020-04-22 22:00:00 anger 244
## 6 2020-04-22 22:00:00 fear 478
## 7 2020-04-22 22:00:00 joy 548
## 8 2020-04-22 22:00:00 sadness 330
## 9 2020-04-22 23:00:00 anger 146
## 10 2020-04-22 23:00:00 fear 320
## # ... with 70 more rows
Let’s visualize the time trend of sentiment in tweets toward COVID-19.
library(ggplot2)
library(ggthemes)
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
count(hour, sentiment) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The Bing Lexicon was used to measure sentiment in tweets")
## Joining, by = "word"
covid_tweets_tidy %>%
inner_join(lexicon_nrc()) %>%
count(hour, sentiment) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The NRC Lexicon was used to measure sentiment in tweets")
## Joining, by = "word"
covid_tweets_tidy %>%
inner_join(lexicon_nrc_eil(), by=c("word"="term")) %>%
count(hour, AffectDimension) %>%
ggplot(aes(x=hour, y=n, colour=AffectDimension)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The NRC-EIL was used to measure sentiment in tweets")
We can analyze the word counts that contribute to positive and negative sentiment in tweets. By calling count() with both word and sentiment as arguments, we find out how much each word contributed to each sentiment.
# Word count on tweets
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
count(sentiment, word, sort=TRUE) # Counting words by sentiments
## Joining, by = "word"
## # A tibble: 1,973 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 positive like 460
## 2 positive good 337
## 3 positive work 316
## 4 positive positive 301
## 5 negative virus 282
## 6 positive thank 255
## 7 positive safe 234
## 8 positive well 232
## 9 positive trump 223
## 10 negative crisis 212
## # ... with 1,963 more rows
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
count(sentiment, word, sort=TRUE) %>%
filter(sentiment=="positive") %>%
arrange(desc(n))
## Joining, by = "word"
## # A tibble: 751 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 positive like 460
## 2 positive good 337
## 3 positive work 316
## 4 positive positive 301
## 5 positive thank 255
## 6 positive safe 234
## 7 positive well 232
## 8 positive trump 223
## 9 positive support 205
## 10 positive great 187
## # ... with 741 more rows
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
count(sentiment, word, sort=TRUE) %>%
filter(sentiment=="negative") %>%
arrange(desc(n))
## Joining, by = "word"
## # A tibble: 1,222 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 negative virus 282
## 2 negative crisis 212
## 3 negative death 197
## 4 negative died 133
## 5 negative hard 98
## 6 negative infected 98
## 7 negative lost 85
## 8 negative die 84
## 9 negative risk 83
## 10 negative sick 83
## # ... with 1,212 more rows
The words like, positive, and trump should be removed from the list of positive words because, in the context of COVID-19, their meanings are not related to positive feelings; the word virus should likewise be removed from the list of negative words because it is mostly used to refer to the coronavirus itself.
covid_tweets_tidy %>%
inner_join(lexicon_nrc()) %>%
count(sentiment, word, sort=TRUE) %>%
group_by(sentiment) %>%
top_n(10) %>%
arrange(sentiment, desc(n)) %>%
ungroup
## Joining, by = "word"
## Selecting by n
## # A tibble: 102 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 anger fight 217
## 2 anger death 197
## 3 anger money 119
## 4 anger fighting 115
## 5 anger disease 85
## 6 anger hit 73
## 7 anger dying 72
## 8 anger bad 69
## 9 anger feeling 51
## 10 anger challenge 44
## # ... with 92 more rows
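Note that top_n() selects by the last column when no weighting variable is given (hence the "Selecting by n" message) and keeps ties. It works in this session's dplyr (0.8.5), but in dplyr >= 1.0.0 it is superseded by slice_max(); under newer dplyr the same ranking could be written as:
# Same top-10 words per sentiment, using the newer slice_max() verb
covid_tweets_tidy %>%
  inner_join(lexicon_nrc()) %>%
  count(sentiment, word, sort=TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% # requires dplyr >= 1.0.0
  arrange(sentiment, desc(n)) %>%
  ungroup()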
The words virus in “negative”, don in “positive” and “trust”, and trump in “surprise” are to be excluded from the analysis using the NRC lexicon.
covid_tweets_tidy %>%
inner_join(lexicon_nrc_eil(), by=c("word"="term")) %>%
count(AffectDimension, word) %>%
group_by(AffectDimension) %>%
top_n(10) %>%
arrange(AffectDimension, desc(n)) %>%
ungroup
## Selecting by n
## # A tibble: 40 x 3
## AffectDimension word n
## <chr> <chr> <int>
## 1 anger fight 217
## 2 anger death 197
## 3 anger money 119
## 4 anger fighting 115
## 5 anger disease 85
## 6 anger hit 73
## 7 anger dying 72
## 8 anger bad 69
## 9 anger feeling 51
## 10 anger challenge 44
## # ... with 30 more rows
Likewise, the word positive in “joy” is to be excluded from the analysis using the NRC-EIL.
words_out <- c("like", "positive", "trump", "virus", "don")
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
filter(!word %in% words_out) %>%
count(hour, sentiment) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The Bing Lexicon was used to measure sentiment in tweets")
## Joining, by = "word"
covid_tweets_tidy %>%
inner_join(lexicon_nrc()) %>%
filter(!word %in% words_out) %>%
count(hour, sentiment) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The NRC Lexicon was used to measure sentiment in tweets")
## Joining, by = "word"
covid_tweets_tidy %>%
inner_join(lexicon_nrc_eil(), by=c("word"="term")) %>%
filter(!word %in% words_out) %>%
count(hour, AffectDimension) %>%
ggplot(aes(x=hour, y=n, colour=AffectDimension)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The NRC-EIL was used to measure sentiment in tweets")
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
filter(!word %in% words_out) %>%
count(hour, status_id, sentiment)
## Joining, by = "word"
## # A tibble: 8,929 x 4
## hour status_id sentiment n
## <dttm> <chr> <chr> <int>
## 1 2020-04-22 21:00:00 1253067601535815680 negative 5
## 2 2020-04-22 21:00:00 1253067601535815680 positive 1
## 3 2020-04-22 21:00:00 1253067622519681024 negative 1
## 4 2020-04-22 21:00:00 1253067629759209472 negative 2
## 5 2020-04-22 21:00:00 1253067638491807748 negative 2
## 6 2020-04-22 21:00:00 1253067644388982785 negative 1
## 7 2020-04-22 21:00:00 1253067644388982785 positive 3
## 8 2020-04-22 21:00:00 1253067656690839552 positive 4
## 9 2020-04-22 21:00:00 1253067673103204353 positive 1
## 10 2020-04-22 21:00:00 1253067716015071232 positive 1
## # ... with 8,919 more rows
* Let’s consider how we can calculate a net sentiment score by tweet: the sum of positive words minus the sum of negative words in each tweet.
* To do so, we need two separate columns for positive and negative counts.
* Also, some tweets contain words of only one sentiment, so the missing category must be filled with zero.
* So we will use spread() from the tidyr package.

spread() takes three principal arguments: the data, the key column (whose values become the new column names), and the value column (whose values fill the new columns). Counting sentiment by hour and tweet first yields a frequency table where the observations of sentiment for each tweet are spread across multiple rows: 9,559 observations from 7,203 tweets of 4 variables (hour, status_id, sentiment, n), as shown after the toy example below.
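Here is a toy example of spread() on made-up counts, showing how the key column's values become new column names and how fill = 0 plugs the gaps:
toy <- tibble(status_id = c("a", "a", "b"),
              sentiment = c("negative", "positive", "positive"),
              n = c(2, 1, 3))
spread(toy, key = sentiment, value = n, fill = 0) # tweet "b" has no negative words, so it gets 0
## # A tibble: 2 x 3
##   status_id negative positive
##   <chr>        <dbl>    <dbl>
## 1 a                2        1
## 2 b                0        3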
library(tibble)
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
count(hour, status_id, sentiment)
## Joining, by = "word"
## # A tibble: 9,559 x 4
## hour status_id sentiment n
## <dttm> <chr> <chr> <int>
## 1 2020-04-22 21:00:00 1253067601535815680 negative 5
## 2 2020-04-22 21:00:00 1253067601535815680 positive 1
## 3 2020-04-22 21:00:00 1253067622519681024 negative 1
## 4 2020-04-22 21:00:00 1253067629759209472 negative 2
## 5 2020-04-22 21:00:00 1253067638491807748 negative 2
## 6 2020-04-22 21:00:00 1253067644388982785 negative 1
## 7 2020-04-22 21:00:00 1253067644388982785 positive 3
## 8 2020-04-22 21:00:00 1253067656690839552 positive 4
## 9 2020-04-22 21:00:00 1253067673103204353 positive 1
## 10 2020-04-22 21:00:00 1253067716015071232 positive 1
## # ... with 9,549 more rows
Using spread() to key on sentiment with values from n, this becomes 7,203 observations of 4 variables (hour, status_id, negative, positive).
library(tidyr)
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
count(hour, status_id, sentiment) %>%
spread(key=sentiment, value=n, fill = 0)
## Joining, by = "word"
## # A tibble: 7,203 x 4
## hour status_id negative positive
## <dttm> <chr> <dbl> <dbl>
## 1 2020-04-22 21:00:00 1253067601535815680 5 1
## 2 2020-04-22 21:00:00 1253067622519681024 1 0
## 3 2020-04-22 21:00:00 1253067629759209472 2 0
## 4 2020-04-22 21:00:00 1253067638491807748 2 0
## 5 2020-04-22 21:00:00 1253067644388982785 1 3
## 6 2020-04-22 21:00:00 1253067656690839552 0 4
## 7 2020-04-22 21:00:00 1253067673103204353 0 1
## 8 2020-04-22 21:00:00 1253067716015071232 0 1
## 9 2020-04-22 21:00:00 1253067749305323525 2 1
## 10 2020-04-22 21:00:00 1253067782016688128 0 1
## # ... with 7,193 more rows
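As an aside, spread() is superseded by pivot_wider() in tidyr 1.0+ (the version loaded here); an equivalent call would be:
# Same reshaping with the newer pivot_wider() verb
covid_tweets_tidy %>%
  inner_join(lexicon_bing()) %>%
  count(hour, status_id, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n,
              values_fill = list(n = 0)) # a bare values_fill = 0 works in tidyr >= 1.1.0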
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
filter(!word %in% words_out) %>%
count(hour, status_id, sentiment) %>%
spread(key=sentiment, value=n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## # A tibble: 6,866 x 5
## hour status_id negative positive sentiment
## <dttm> <chr> <dbl> <dbl> <dbl>
## 1 2020-04-22 21:00:00 1253067601535815680 5 1 -4
## 2 2020-04-22 21:00:00 1253067622519681024 1 0 -1
## 3 2020-04-22 21:00:00 1253067629759209472 2 0 -2
## 4 2020-04-22 21:00:00 1253067638491807748 2 0 -2
## 5 2020-04-22 21:00:00 1253067644388982785 1 3 2
## 6 2020-04-22 21:00:00 1253067656690839552 0 4 4
## 7 2020-04-22 21:00:00 1253067673103204353 0 1 1
## 8 2020-04-22 21:00:00 1253067716015071232 0 1 1
## 9 2020-04-22 21:00:00 1253067749305323525 2 0 -2
## 10 2020-04-22 21:00:00 1253067791227392002 0 1 1
## # ... with 6,856 more rows
# Assigning each tweet with either positive or negative sentiment by the net score
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
filter(!word %in% words_out) %>%
count(hour, status_id, sentiment) %>%
spread(key=sentiment, value=n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
mutate(sentiment = ifelse(sentiment > 0, "Positive",
ifelse(sentiment < 0, "Negative", "Neutral")))
## Joining, by = "word"
## # A tibble: 6,866 x 5
## hour status_id negative positive sentiment
## <dttm> <chr> <dbl> <dbl> <chr>
## 1 2020-04-22 21:00:00 1253067601535815680 5 1 Negative
## 2 2020-04-22 21:00:00 1253067622519681024 1 0 Negative
## 3 2020-04-22 21:00:00 1253067629759209472 2 0 Negative
## 4 2020-04-22 21:00:00 1253067638491807748 2 0 Negative
## 5 2020-04-22 21:00:00 1253067644388982785 1 3 Positive
## 6 2020-04-22 21:00:00 1253067656690839552 0 4 Positive
## 7 2020-04-22 21:00:00 1253067673103204353 0 1 Positive
## 8 2020-04-22 21:00:00 1253067716015071232 0 1 Positive
## 9 2020-04-22 21:00:00 1253067749305323525 2 0 Negative
## 10 2020-04-22 21:00:00 1253067791227392002 0 1 Positive
## # ... with 6,856 more rows
# ifelse(test, yes, no) returns a value with the same shape as test which is filled with elements selected from either yes or no depending on whether the element of test is TRUE or FALSE.
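The nested ifelse() calls can also be written with dplyr::case_when(), which reads top to bottom like a chain of if/else branches; a small sketch on made-up net scores:
# case_when() evaluates conditions in order; TRUE acts as the catch-all branch
tibble(net = c(-4, 0, 2)) %>%
  mutate(sentiment = case_when(
    net > 0 ~ "Positive",
    net < 0 ~ "Negative",
    TRUE    ~ "Neutral"
  ))
## # A tibble: 3 x 2
##     net sentiment
##   <dbl> <chr>
## 1    -4 Negative
## 2     0 Neutral
## 3     2 Positive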
covid_tweets_bing <- covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
filter(!word %in% words_out) %>%
count(hour, status_id, sentiment) %>%
spread(key=sentiment, value=n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
mutate(sentiment = ifelse(sentiment > 0, "Positive",
ifelse(sentiment < 0, "Negative", "Neutral")))
## Joining, by = "word"
# Now we are going to plot these net sentiment scores across hour-long bins. Note that we plot against the hour variable on the x-axis, which keeps track of when each tweet was posted
covid_tweets_bing %>%
count(hour, sentiment) %>%
ggplot(aes(x=hour,y=n, colour=sentiment)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum of Tweets",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The Bing Lexicon was used to measure sentiment in tweets")
covid_tweets_tidy %>%
inner_join(lexicon_bing()) %>%
filter(!word %in% words_out) %>%
count(hour, sentiment) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The Bing Lexicon was used to measure sentiment in tweets")
## Joining, by = "word"