* Tidying the tweet data set
* Joining with the Bing sentiment lexicon
* Counting the frequency of positive and negative sentiment words in each tweet
* Calculating the sentiment score for each tweet by subtracting the frequency of negative words from the frequency of positive words
* Determining each tweet's sentiment by its net sentiment score
We know how to convert our tweet data into a tidy text data format.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.3 √ purrr 0.3.4
## √ tibble 3.1.0 √ dplyr 1.0.4
## √ tidyr 1.1.2 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stringr)
library(stopwords)
load("covid_tweets_423.RData")
covid_tweets # This dataset contains 18,224 tweets about COVID-19, including geo-location information.
## # A tibble: 18,224 x 9
## user_id status_id created_at screen_name text lang country lat
## <chr> <chr> <dttm> <chr> <chr> <chr> <chr> <dbl>
## 1 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ "@ev~ en United~ 36.0
## 2 1694802~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ "Ple~ en United~ 36.9
## 3 2155830~ 12533658~ 2020-04-23 16:51:09 KOROGLU_BA~ "@Ay~ tr Azerba~ 40.2
## 4 7445974~ 12533657~ 2020-04-23 16:51:05 FoodFocusSA "Pre~ en South ~ -26.1
## 5 1558777~ 12533657~ 2020-04-23 16:51:01 opcionsecu~ "#AT~ es Ecuador -1.67
## 6 9989605~ 12533657~ 2020-04-23 16:51:01 amystones4 "Tha~ en United~ 53.7
## 7 1027687~ 12533657~ 2020-04-23 16:51:00 COTACYT "Men~ es Mexico 23.7
## 8 2473827~ 12533657~ 2020-04-23 16:50:54 bkracing123 "The~ en United~ 53.9
## 9 17566234 12533657~ 2020-04-23 16:50:51 AnnStrahm "Thi~ en United~ 37.5
## 10 2267079~ 12533656~ 2020-04-23 16:50:42 JLeonRojas "INF~ es Chile -35.5
## # ... with 18,214 more rows, and 1 more variable: lng <dbl>
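If the wide tibble printout is hard to scan, glimpse() from dplyr lists every column on its own line; this is just a quick inspection aid, not part of the analysis.
glimpse(covid_tweets) # Showing each of the 9 columns with its type and first few values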
# Let's preprocess tweet text
covid_tweets_tidy <- covid_tweets %>%
filter(lang == "en") %>% # Selecting tweets only written in English
mutate(hour = floor_date(created_at, unit="hour")) %>% # Creating a variable to aggregate tweets into the hour-long unit of time
mutate(text = str_replace_all(text, "[#@]?[^[:ascii:]]+", " ")) %>% # Removing non-ASCII characters
mutate(text = str_replace_all(text, "&|<|>|"|RT", " ")) %>% # Removing HTML tags and retweet marker
unnest_tweets(word, text) %>% # Splitting text into words by unnest_tweets
filter(!word %in% stopwords()) %>% # Removing words matched by any element in stopwords() vector
filter(str_detect(word, "[a-z]")) # Selecting words that should contain any alphbetical letter
## Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
covid_tweets_tidy
## # A tibble: 167,641 x 10
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 4 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 5 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 6 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 7 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 8 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 9 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 10 4794913~ 125336581~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## # ... with 167,631 more rows, and 2 more variables: hour <dttm>, word <chr>
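Before joining with a lexicon, a quick frequency count (a sanity check, not part of the original pipeline) confirms that the tokenization looks sensible.
covid_tweets_tidy %>%
count(word, sort = TRUE) # Listing the most frequent words after preprocessing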
Once the text is in a tidy format, we are ready to do sentiment analysis using inner_join().
library(textdata)
lexicon_bing()
## # A tibble: 6,787 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,777 more rows
bing_lexicon <- lexicon_bing()
covid_tweets_tidy %>%
inner_join(bing_lexicon, by="word") # Joining with the bing lexicon by the "word" column. A character vector of variables to join by; using a variable with a common name across the two data sets.
## # A tibble: 15,936 x 11
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 479491~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 169480~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## 4 998960~ 12533657~ 2020-04-23 16:51:01 amystones4 en United~ 53.7 -1.65
## 5 998960~ 12533657~ 2020-04-23 16:51:01 amystones4 en United~ 53.7 -1.65
## 6 247382~ 12533657~ 2020-04-23 16:50:54 bkracing123 en United~ 53.9 -1.21
## 7 247382~ 12533657~ 2020-04-23 16:50:54 bkracing123 en United~ 53.9 -1.21
## 8 175662~ 12533657~ 2020-04-23 16:50:51 AnnStrahm en United~ 37.5 -121.
## 9 314986~ 12533656~ 2020-04-23 16:50:39 dande_hema~ en India 15.9 80.8
## 10 314986~ 12533656~ 2020-04-23 16:50:39 dande_hema~ en India 15.9 80.8
## # ... with 15,926 more rows, and 3 more variables: hour <dttm>, word <chr>,
## # sentiment <chr>
covid_tweets_tidy %>%
inner_join(bing_lexicon, by="word") %>%
count(sentiment)
## # A tibble: 2 x 2
## sentiment n
## * <chr> <int>
## 1 negative 7384
## 2 positive 8552
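To put these counts in perspective, we can add a proportion column; a small extension of the pipeline above.
covid_tweets_tidy %>%
inner_join(bing_lexicon, by="word") %>%
count(sentiment) %>%
mutate(prop = n / sum(n)) # Positive words make up roughly 54% (8,552 of 15,936)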
lexicon_nrc()
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
nrc_lexicon <- lexicon_nrc()
covid_tweets_tidy %>%
inner_join(nrc_lexicon)
## Joining, by = "word"
## # A tibble: 57,217 x 11
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 4 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 5 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 6 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 7 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 8 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 9 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 10 1694802~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## # ... with 57,207 more rows, and 3 more variables: hour <dttm>, word <chr>,
## # sentiment <chr>
In case you have a problem loading lexicon_nrc() from the textdata package, tidytext's get_sentiments("nrc") provides the same lexicon:
library(tidytext)
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
nrc_lexicon <- get_sentiments("nrc")
covid_tweets_tidy %>%
inner_join(nrc_lexicon) # Joining by "word"
## Joining, by = "word"
## # A tibble: 57,217 x 11
## user_id status_id created_at screen_name lang country lat lng
## <chr> <chr> <dttm> <chr> <chr> <chr> <dbl> <dbl>
## 1 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 2 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 3 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 4 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 5 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 6 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 7 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 8 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 9 4794913~ 12533658~ 2020-04-23 16:51:11 Vegastechh~ en United~ 36.0 -115.
## 10 1694802~ 12533658~ 2020-04-23 16:51:11 Coachjmorr~ en United~ 36.9 -81.1
## # ... with 57,207 more rows, and 3 more variables: hour <dttm>, word <chr>,
## # sentiment <chr>
After joining a tidy text data set with a sentiment lexicon, we can count the sentiment variable by the time variable, hour.
# Counting sentiments by hour
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
count(hour, sentiment) # Counting sentiments by hour
## Joining, by = "word"
## # A tibble: 40 x 3
## hour sentiment n
## <dttm> <chr> <int>
## 1 2020-04-22 21:00:00 negative 516
## 2 2020-04-22 21:00:00 positive 525
## 3 2020-04-22 22:00:00 negative 524
## 4 2020-04-22 22:00:00 positive 507
## 5 2020-04-22 23:00:00 negative 349
## 6 2020-04-22 23:00:00 positive 418
## 7 2020-04-23 00:00:00 negative 356
## 8 2020-04-23 00:00:00 positive 405
## 9 2020-04-23 01:00:00 negative 370
## 10 2020-04-23 01:00:00 positive 397
## # ... with 30 more rows
covid_tweets_tidy %>%
inner_join(nrc_lexicon) %>%
count(hour, sentiment)
## Joining, by = "word"
## # A tibble: 200 x 3
## hour sentiment n
## <dttm> <chr> <int>
## 1 2020-04-22 21:00:00 anger 223
## 2 2020-04-22 21:00:00 anticipation 379
## 3 2020-04-22 21:00:00 disgust 179
## 4 2020-04-22 21:00:00 fear 384
## 5 2020-04-22 21:00:00 joy 280
## 6 2020-04-22 21:00:00 negative 577
## 7 2020-04-22 21:00:00 positive 720
## 8 2020-04-22 21:00:00 sadness 289
## 9 2020-04-22 21:00:00 surprise 201
## 10 2020-04-22 21:00:00 trust 462
## # ... with 190 more rows
Let’s visualize the time trend of sentiment in tweets toward COVID-19
library(ggplot2)
library(ggthemes)
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
count(hour, sentiment) %>%
arrange(hour) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum of Words",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The Bing Lexicon was used to measure sentiment in tweets")
## Joining, by = "word"
covid_tweets_tidy %>%
inner_join(nrc_lexicon) %>%
count(hour, sentiment) %>%
arrange(hour) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
scale_colour_brewer(palette="Paired") +
theme_bw() +
labs(x = NULL, y = "Hourly Sum",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The NRC Lexicon was used to measure sentiment in tweets")
## Joining, by = "word"
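With ten NRC categories drawn on a single panel, the lines can be hard to tell apart even with the Paired palette. One alternative (a sketch, not the original figure) is to facet by sentiment:
covid_tweets_tidy %>%
inner_join(nrc_lexicon, by="word") %>%
count(hour, sentiment) %>%
ggplot(aes(x=hour, y=n)) +
geom_line() +
facet_wrap(~ sentiment, ncol=2) + # One small panel per NRC category
theme_bw() +
labs(x = NULL, y = "Hourly Sum")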
What are the most common sentiment words used in the tweets? We can analyze the word counts that contribute to positive and negative sentiment. By calling count() with both word and sentiment, we find out how much each word contributed to each sentiment.
# Word count on tweets
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
count(sentiment, word, sort=TRUE) # Counting words by sentiments
## Joining, by = "word"
## # A tibble: 1,973 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 positive like 460
## 2 positive good 337
## 3 positive work 316
## 4 positive positive 301
## 5 negative virus 282
## 6 positive thank 255
## 7 positive safe 234
## 8 positive well 232
## 9 positive trump 223
## 10 negative crisis 212
## # ... with 1,963 more rows
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
count(sentiment, word, sort=TRUE) %>%
filter(sentiment=="positive") %>%
arrange(desc(n))
## Joining, by = "word"
## # A tibble: 751 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 positive like 460
## 2 positive good 337
## 3 positive work 316
## 4 positive positive 301
## 5 positive thank 255
## 6 positive safe 234
## 7 positive well 232
## 8 positive trump 223
## 9 positive support 205
## 10 positive great 187
## # ... with 741 more rows
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
count(sentiment, word, sort=TRUE) %>%
filter(sentiment=="negative") %>%
arrange(desc(n))
## Joining, by = "word"
## # A tibble: 1,222 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 negative virus 282
## 2 negative crisis 212
## 3 negative death 197
## 4 negative died 133
## 5 negative hard 98
## 6 negative infected 98
## 7 negative lost 85
## 8 negative die 84
## 9 negative risk 83
## 10 negative sick 83
## # ... with 1,212 more rows
The words like, positive, and trump should be removed from the list of positive words because, in the context of COVID-19, they do not express positive feeling; the word virus should likewise be removed from the list of negative words because it is mostly used to refer to the coronavirus itself.
covid_tweets_tidy %>%
inner_join(nrc_lexicon) %>%
count(sentiment, word, sort=TRUE) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
arrange(sentiment, desc(n)) %>%
ungroup
## Joining, by = "word"
## # A tibble: 102 x 3
## sentiment word n
## <chr> <chr> <int>
## 1 anger fight 217
## 2 anger death 197
## 3 anger money 119
## 4 anger fighting 115
## 5 anger disease 85
## 6 anger hit 73
## 7 anger dying 72
## 8 anger bad 69
## 9 anger feeling 51
## 10 anger challenge 44
## # ... with 92 more rows
Using the NRC lexicon, the words virus (tagged "negative"), don (tagged "positive" and "trust"), and trump (tagged "surprise") should be excluded from the analysis as well.
words_out <- c("like", "positive", "trump", "virus", "don")
words_out
## [1] "like" "positive" "trump" "virus" "don"
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
filter(!word %in% words_out) %>%
count(hour, sentiment) %>%
arrange(hour) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The Bing Lexicon was used to measure sentiment in tweets")
## Joining, by = "word"
covid_tweets_tidy %>%
inner_join(nrc_lexicon) %>%
filter(!word %in% words_out) %>%
count(hour, sentiment) %>%
arrange(hour) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
scale_colour_brewer(palette="Paired") +
theme_bw() +
labs(x = NULL, y = "Hourly Sum",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The NRC Lexicon was used to measure sentiment in tweets")
## Joining, by = "word"
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
filter(!word %in% words_out) %>%
count(hour, status_id, sentiment) %>%
arrange(hour)
## Joining, by = "word"
## # A tibble: 8,929 x 4
## hour status_id sentiment n
## <dttm> <chr> <chr> <int>
## 1 2020-04-22 21:00:00 1253067601535815680 negative 5
## 2 2020-04-22 21:00:00 1253067601535815680 positive 1
## 3 2020-04-22 21:00:00 1253067622519681024 negative 1
## 4 2020-04-22 21:00:00 1253067629759209472 negative 2
## 5 2020-04-22 21:00:00 1253067638491807748 negative 2
## 6 2020-04-22 21:00:00 1253067644388982785 negative 1
## 7 2020-04-22 21:00:00 1253067644388982785 positive 3
## 8 2020-04-22 21:00:00 1253067656690839552 positive 4
## 9 2020-04-22 21:00:00 1253067673103204353 positive 1
## 10 2020-04-22 21:00:00 1253067716015071232 positive 1
## # ... with 8,919 more rows
Calculating the net score (positive - negative) using the “Bing” lexicon
* Let's consider how we can calculate the net sentiment score by tweet: the sum of positive words minus the sum of negative words in each tweet
* To do so, we need two separate columns for the positive and negative counts
* And some tweets will contain words of only one sentiment, so the missing counts must be filled with zeros
* So, we will use spread() from the tidyr package
spread() reshapes long-formatted data into wide-formatted data: it takes a key-value pair of columns and spreads the values across multiple new columns. Its three principal arguments are the data, the key column (whose values become the new column names), and the value column (whose values fill those columns).
This yields a long-format frequency table in which the sentiment observations for each tweet are spread across multiple rows: 9,559 observations from 6,866 tweets, with 4 variables (hour, status_id, sentiment, n).
library(tibble)
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
filter(!word %in% words_out) %>%
count(hour, status_id)
## Joining, by = "word"
## # A tibble: 6,866 x 3
## hour status_id n
## <dttm> <chr> <int>
## 1 2020-04-22 21:00:00 1253067601535815680 6
## 2 2020-04-22 21:00:00 1253067622519681024 1
## 3 2020-04-22 21:00:00 1253067629759209472 2
## 4 2020-04-22 21:00:00 1253067638491807748 2
## 5 2020-04-22 21:00:00 1253067644388982785 4
## 6 2020-04-22 21:00:00 1253067656690839552 4
## 7 2020-04-22 21:00:00 1253067673103204353 1
## 8 2020-04-22 21:00:00 1253067716015071232 1
## 9 2020-04-22 21:00:00 1253067749305323525 2
## 10 2020-04-22 21:00:00 1253067791227392002 1
## # ... with 6,856 more rows
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
count(hour, status_id, sentiment)
## Joining, by = "word"
## # A tibble: 9,559 x 4
## hour status_id sentiment n
## <dttm> <chr> <chr> <int>
## 1 2020-04-22 21:00:00 1253067601535815680 negative 5
## 2 2020-04-22 21:00:00 1253067601535815680 positive 1
## 3 2020-04-22 21:00:00 1253067622519681024 negative 1
## 4 2020-04-22 21:00:00 1253067629759209472 negative 2
## 5 2020-04-22 21:00:00 1253067638491807748 negative 2
## 6 2020-04-22 21:00:00 1253067644388982785 negative 1
## 7 2020-04-22 21:00:00 1253067644388982785 positive 3
## 8 2020-04-22 21:00:00 1253067656690839552 positive 4
## 9 2020-04-22 21:00:00 1253067673103204353 positive 1
## 10 2020-04-22 21:00:00 1253067716015071232 positive 1
## # ... with 9,549 more rows
library(tidyr)
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
count(hour, status_id, sentiment) %>%
spread(key=sentiment, value=n, fill = 0)
## Joining, by = "word"
## # A tibble: 7,203 x 4
## hour status_id negative positive
## <dttm> <chr> <dbl> <dbl>
## 1 2020-04-22 21:00:00 1253067601535815680 5 1
## 2 2020-04-22 21:00:00 1253067622519681024 1 0
## 3 2020-04-22 21:00:00 1253067629759209472 2 0
## 4 2020-04-22 21:00:00 1253067638491807748 2 0
## 5 2020-04-22 21:00:00 1253067644388982785 1 3
## 6 2020-04-22 21:00:00 1253067656690839552 0 4
## 7 2020-04-22 21:00:00 1253067673103204353 0 1
## 8 2020-04-22 21:00:00 1253067716015071232 0 1
## 9 2020-04-22 21:00:00 1253067749305323525 2 1
## 10 2020-04-22 21:00:00 1253067782016688128 0 1
## # ... with 7,193 more rows
Using spread() to key on sentiment with values from n, this becomes 7,203 observations of 4 variables (hour, status_id, negative, positive).
The function spread() takes two columns ("key" and "value") and spreads the values into multiple columns, producing a "wide" data format from a "long" one.
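A minimal toy example (with made-up status_id values) makes the reshaping visible; note that in current tidyr, pivot_wider() supersedes spread().
toy <- tibble(status_id = c("t1", "t1", "t2"),
sentiment = c("negative", "positive", "positive"),
n = c(2, 1, 3))
toy %>% spread(key=sentiment, value=n, fill = 0) # t2 has no negative words, so it gets negative = 0
# Equivalent with the newer tidyr interface:
# toy %>% pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)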
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
filter(!word %in% words_out) %>%
count(hour, status_id, sentiment) %>%
spread(key=sentiment, value=n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## # A tibble: 6,866 x 5
## hour status_id negative positive sentiment
## <dttm> <chr> <dbl> <dbl> <dbl>
## 1 2020-04-22 21:00:00 1253067601535815680 5 1 -4
## 2 2020-04-22 21:00:00 1253067622519681024 1 0 -1
## 3 2020-04-22 21:00:00 1253067629759209472 2 0 -2
## 4 2020-04-22 21:00:00 1253067638491807748 2 0 -2
## 5 2020-04-22 21:00:00 1253067644388982785 1 3 2
## 6 2020-04-22 21:00:00 1253067656690839552 0 4 4
## 7 2020-04-22 21:00:00 1253067673103204353 0 1 1
## 8 2020-04-22 21:00:00 1253067716015071232 0 1 1
## 9 2020-04-22 21:00:00 1253067749305323525 2 0 -2
## 10 2020-04-22 21:00:00 1253067791227392002 0 1 1
## # ... with 6,856 more rows
Assigning each tweet a positive, negative, or neutral sentiment based on its net score
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
filter(!word %in% words_out) %>%
count(hour, status_id, sentiment) %>%
spread(key=sentiment, value=n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
mutate(sentiment = ifelse(sentiment > 0, "Positive",
ifelse(sentiment < 0, "Negative", "Neutral")))
## Joining, by = "word"
## # A tibble: 6,866 x 5
## hour status_id negative positive sentiment
## <dttm> <chr> <dbl> <dbl> <chr>
## 1 2020-04-22 21:00:00 1253067601535815680 5 1 Negative
## 2 2020-04-22 21:00:00 1253067622519681024 1 0 Negative
## 3 2020-04-22 21:00:00 1253067629759209472 2 0 Negative
## 4 2020-04-22 21:00:00 1253067638491807748 2 0 Negative
## 5 2020-04-22 21:00:00 1253067644388982785 1 3 Positive
## 6 2020-04-22 21:00:00 1253067656690839552 0 4 Positive
## 7 2020-04-22 21:00:00 1253067673103204353 0 1 Positive
## 8 2020-04-22 21:00:00 1253067716015071232 0 1 Positive
## 9 2020-04-22 21:00:00 1253067749305323525 2 0 Negative
## 10 2020-04-22 21:00:00 1253067791227392002 0 1 Positive
## # ... with 6,856 more rows
# ifelse(test, yes, no) returns a value with the same shape as test which is filled with elements selected from either yes or no depending on whether the element of test is TRUE or FALSE.
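A quick illustration of the nested ifelse() on a toy vector of net scores (values chosen for illustration); dplyr's case_when() expresses the same three-way branching more readably.
net <- c(2, -1, 0)
ifelse(net > 0, "Positive", ifelse(net < 0, "Negative", "Neutral"))
## [1] "Positive" "Negative" "Neutral"
# The same logic with case_when():
# case_when(net > 0 ~ "Positive", net < 0 ~ "Negative", TRUE ~ "Neutral")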
covid_tweets_bing <- covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
filter(!word %in% words_out) %>%
count(hour, status_id, sentiment) %>%
spread(key=sentiment, value=n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
mutate(sentiment = ifelse(sentiment > 0, "Positive",
ifelse(sentiment < 0, "Negative", "Neutral")))
## Joining, by = "word"
covid_tweets_bing
## # A tibble: 6,866 x 5
## hour status_id negative positive sentiment
## <dttm> <chr> <dbl> <dbl> <chr>
## 1 2020-04-22 21:00:00 1253067601535815680 5 1 Negative
## 2 2020-04-22 21:00:00 1253067622519681024 1 0 Negative
## 3 2020-04-22 21:00:00 1253067629759209472 2 0 Negative
## 4 2020-04-22 21:00:00 1253067638491807748 2 0 Negative
## 5 2020-04-22 21:00:00 1253067644388982785 1 3 Positive
## 6 2020-04-22 21:00:00 1253067656690839552 0 4 Positive
## 7 2020-04-22 21:00:00 1253067673103204353 0 1 Positive
## 8 2020-04-22 21:00:00 1253067716015071232 0 1 Positive
## 9 2020-04-22 21:00:00 1253067749305323525 2 0 Negative
## 10 2020-04-22 21:00:00 1253067791227392002 0 1 Positive
## # ... with 6,856 more rows
# Now we plot the number of tweets in each net-sentiment category across hour-long bins. Note that we are plotting against the hour variable on the x-axis, which keeps track of when each tweet was posted
covid_tweets_bing %>%
count(hour, sentiment) %>%
arrange(hour) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum of Tweets",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The Bing Lexicon was used to measure sentiment in tweets")
covid_tweets_tidy %>%
inner_join(bing_lexicon) %>%
filter(!word %in% words_out) %>%
count(hour, sentiment) %>%
arrange(hour) %>%
ggplot(aes(x=hour, y=n, colour=sentiment)) +
geom_line() +
theme_bw() +
labs(x = NULL, y = "Hourly Sum of Words Used",
title = "Tracing the rhythm of expressing sentiments toward COVID-19 on Twitter",
subtitle = "The Bing Lexicon was used to measure sentiment in tweets")
## Joining, by = "word"
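Finally, a quick tally (a sketch; the exact numbers depend on the filtering above) shows how the 6,866 scored tweets split across the three categories.
covid_tweets_bing %>%
count(sentiment) %>%
mutate(prop = n / sum(n)) # Share of Positive, Negative, and Neutral tweets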