Sentiment analysis is a tool for assessing the emotional tone of a piece of text. For example, we can use it to understand public perceptions of topics in environmental policy like energy, climate, and conservation.
library(tidyr) #data tidying and reshaping
library(lubridate) #working with date data
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(pdftools) #read in pdfs
## Using poppler version 22.02.0
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ dplyr 1.0.7
## ✓ tibble 3.1.6 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
library(tidytext)
library(here)
## here() starts at /Users/benjaminmoscona/Documents/eds231_textSent
library(LexisNexisTools) #Nexis Uni data wrangling
## LexisNexisTools Version 0.3.5
library(sentimentr)
library(readr)
library(corpus)
We’ll start by using the Bing sentiment analysis lexicon.
bing_sent <- get_sentiments('bing') #grab the bing sentiment lexicon from tidytext
head(bing_sent, n = 20)
## # A tibble: 20 × 2
##    word          sentiment
##    <chr>         <chr>
##  1 2-faces       negative
##  2 abnormal      negative
##  3 abolish       negative
##  4 abominable    negative
##  5 abominably    negative
##  6 abominate     negative
##  7 abomination   negative
##  8 abort         negative
##  9 aborted       negative
## 10 aborts        negative
## 11 abound        positive
## 12 abounds       positive
## 13 abrade        negative
## 14 abrasive      negative
## 15 abrupt        negative
## 16 abruptly      negative
## 17 abscond       negative
## 18 absence       negative
## 19 absent-minded negative
## 20 absentee      negative
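Before applying the lexicon, it can help to check how it splits between positive and negative labels (a minimal sketch using the bing_sent object loaded above):
bing_sent %>%
  count(sentiment) #number of words carrying each sentiment label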
#to follow along with this example, download this .docx to your working directory:
#https://github.com/MaRo406/EDS_231-text-sentiment/blob/main/nexis_dat/Nexis_IPCC_Results.docx
my_files <- list.files(pattern = ".docx", path = getwd(),
                       full.names = TRUE, recursive = TRUE, ignore.case = TRUE)
dat <- lnt_read(my_files) #Object of class 'LNT output'
## Warning in lnt_asDate(date.v, ...): More than one language was detected. The
## most likely one was chosen (English 84.75%)
meta_df <- dat@meta
articles_df <- dat@articles
paragraphs_df <- dat@paragraphs
dat2 <- tibble(element_id = seq_along(meta_df$Headline), Date = meta_df$Date, Headline = meta_df$Headline)
# May be of use for assignment: using the full text from the articles
paragraphs_dat <- tibble(element_id = paragraphs_df$Art_ID, Text = paragraphs_df$Paragraph)
# paragraphs_dat$Text <- text_tokens(paragraphs_dat$Text)
dat3 <- inner_join(dat2, paragraphs_dat, by = "element_id")
custom_stop_words <- bind_rows(tibble(word = c("your_word"),
                                      lexicon = c("custom")),
                               stop_words)
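custom_stop_words is built here but not used further down; the idea is that it can replace stop_words in the anti_join step that follows, with "your_word" standing in for any project-specific term to drop (a sketch, shown as a comment because text_words is not defined until the next chunk):
# text_words %>% anti_join(custom_stop_words, by = 'word') #instead of anti_join(stop_words, by = 'word')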
clean_tokens <- str_replace_all(dat3$Text,"(.*)(((1[0-2]|0?[1-9])\\/(3[01]|[12][0-9]|0?[1-9])\\/(?:[0-9]{2})?[0-9]{2})|((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)\\s+\\d{1,2},\\s+\\d{4}))(.*)","")
dat3$Text <- clean_tokens #overwrite Text with the date-stripped version
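As a quick sanity check on the date-stripping step, the month-name branch of the pattern can be tested on a made-up string (a minimal sketch; date_pattern and the sample sentence are illustrative, not part of the original code):
date_pattern <- "(.*)((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)\\s+\\d{1,2},\\s+\\d{4})(.*)" #month-name dates only, for brevity
str_replace_all("Published April 18, 2022 by the climate desk", date_pattern, "") #returns "": the leading and trailing (.*) mean any paragraph containing a date is dropped entirely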
#can we create a graph similar to Figure 3A from Froelich et al.?
text_words <- dat3 %>% #break text into individual words
  unnest_tokens(output = word, input = Text, token = 'words')
sent_words <- text_words %>%
  anti_join(stop_words, by = 'word') %>% #returns only the rows without stop words
  inner_join(bing_sent, by = 'word') #joins and retains only sentiment words
sent_scores <- sent_words %>%
  drop_na(Date) %>%
  count(sentiment, element_id, Date) %>%
  spread(sentiment, n) %>%
  replace_na(list(positive = 0, negative = 0)) %>%
  mutate(raw_score = positive - negative, #single sentiment score per article
         offset = mean(positive - negative), #average sentiment across articles
         offset_score = (positive - negative) - offset) %>% #how each article compares to the average
  arrange(desc(raw_score))
sent_scores
## # A tibble: 336 × 7
##    element_id Date       negative positive raw_score offset offset_score
##         <int> <date>        <dbl>    <dbl>     <dbl>  <dbl>        <dbl>
##  1        306 2022-04-18      271      570       299   23.7         275.
##  2        258 2022-04-05      280      572       292   23.7         268.
##  3        218 2022-04-06      286      570       284   23.7         260.
##  4        168 2022-04-07      282      565       283   23.7         259.
##  5        323 2022-04-15       48      257       209   23.7         185.
##  6          1 2022-04-12       48      256       208   23.7         184.
##  7        256 2022-04-06       92      282       190   23.7         166.
##  8        291 2022-04-04       95      284       189   23.7         165.
##  9        324 2022-04-15       56      239       183   23.7         159.
## 10        125 2022-04-11       77      259       182   23.7         158.
## # … with 326 more rows
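Before switching lexicons, the Bing scores can be summarized by day to see how article sentiment moves across the month (a minimal sketch; averaging raw_score by date is my choice here, not part of the original workflow):
sent_scores %>%
  group_by(Date) %>%
  summarize(mean_score = mean(raw_score)) %>% #one value per publication date
  ggplot(aes(Date, mean_score)) +
  geom_line() +
  labs(y = "Mean Bing sentiment score", title = "Daily average article sentiment (Bing lexicon)")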
nrc_sent <- get_sentiments('nrc') #requires downloading a large dataset via prompt
nrc_fear <- nrc_sent %>% #reuse the NRC lexicon loaded above
  filter(sentiment == "fear")
#most common fear words
fear_words <- text_words %>%
  inner_join(nrc_fear) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
nrc_word_counts <- text_words %>%
  inner_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
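To see which words drive each NRC emotion, nrc_word_counts can be plotted as small multiples, one panel per sentiment (a minimal sketch; the cut-off of ten words per panel is arbitrary):
nrc_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% #top ten words per emotion
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)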
book_sent_counts <- text_words %>%
  drop_na(Date) %>%
  group_by(element_id, Date) %>%
  # mutate(page_num = 1:n(),
  #        index = round(page_num / n(), 2)) %>%
  #unnest_tokens(word, line) %>%
  inner_join(get_sentiments("nrc")) %>%
  group_by(sentiment, Date) %>%
  count(sentiment, sort = TRUE) %>%
  ungroup() %>%
  group_by(Date) %>%
  mutate(tot = sum(n),
         pct = n/tot)
## Joining, by = "word"
book_sent_counts %>%
  ggplot(aes(Date, pct, color = sentiment)) +
  geom_line() +
  labs(y = "Percent of Emotion Words", title = "April 2022 Emotions in Articles with keyword: Regenerative Agriculture")
# book_sent_counts %>%
# group_by(sentiment, Date) %>%
# slice_max(n, n = 10) %>%
# ungroup() %>%
# mutate(word = reorder(word, n)) %>%
# ggplot(aes(n, word, fill = sentiment)) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~sentiment, scales = "free_y") +
# labs(x = "Contribution to sentiment",
# y = NULL)
Positive and negative sentiment, as a percent of emotion words, run opposite to each other, which is reassuring from a robustness standpoint even though the sentiment labels are not mutually exclusive. Positivity still dominates the other sentiments. I would want to see how this changes over a longer period; this graph covers 500 articles from April 2022. I would love to use the Nexis API to download the full set of 7000 articles from the past 5 years. Around April 11th, there was a large drop in positivity. I checked articles around this date and saw that it may have been driven down by a low earnings report from Ingredion, which mentions regenerative agriculture.
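One way to spot-check that dip is to pull the headlines published around that date (a minimal sketch using the dat2 headline table built above; the date window is arbitrary):
dat2 %>%
  filter(Date >= as.Date("2022-04-10"), Date <= as.Date("2022-04-12")) %>% #articles around the dip
  select(Date, Headline)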