Introduction

In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways: 1. Work with a different corpus of your choosing, and 2. Incorporate at least one additional sentiment lexicon

Citation

The code in Part 1 is taken directly from the textbook: Text Mining with R: A Tidy Approach, Julia Silge and David Robinson. O’Reilly, 2017.

Part 1

Loading Libraries

library(tidytext)
library(RCurl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(janeaustenr)
library(stringr)
library(textdata)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
## 
##     complete
library(ggplot2)
library(rjson)
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following objects are masked from 'package:rjson':
## 
##     fromJSON, toJSON
library(httr)
## 
## Attaching package: 'httr'
## The following object is masked from 'package:textdata':
## 
##     cache_info
library(XML)
library(rvest)
library(wordcloud)
## Loading required package: RColorBrewer
library(wordcloud2)
library(RColorBrewer)
library(rtweet)
## 
## Attaching package: 'rtweet'
## The following object is masked from 'package:jsonlite':
## 
##     flatten
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
## 
##     content
## The following object is masked from 'package:ggplot2':
## 
##     annotate

Use afinn lexicon to get the sentiments.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.7     ✔ purrr   0.3.4
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ NLP::annotate()         masks ggplot2::annotate()
## ✖ httr::cache_info()      masks textdata::cache_info()
## ✖ tidyr::complete()       masks RCurl::complete()
## ✖ NLP::content()          masks httr::content()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ purrr::flatten()        masks rtweet::flatten(), jsonlite::flatten()
## ✖ jsonlite::fromJSON()    masks rjson::fromJSON()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ jsonlite::toJSON()      masks rjson::toJSON()
library(tidytext)

# Replace the internal `printer` function in the textdata namespace with a stub
# that ignores its arguments and always returns 1.
# NOTE(review): presumably this auto-answers textdata's interactive download
# prompt so the lexicons can be fetched while knitting -- confirm against the
# textdata source, since this patches a private function.
tns <- getNamespace("textdata")
assignInNamespace(x = "printer", value = function(...) 1, ns = tns)
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,467 more rows

The below code uses nrc lexicon to get the sentiments.

get_sentiments("nrc")
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # … with 13,862 more rows

The below code uses bing lexicon to get the sentiments.

get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # … with 6,776 more rows

Get the code from the text book

library(janeaustenr)
library(dplyr)
library(stringr)

# Tokenise the Austen novels to one word per row, tagging every word with the
# line number and running chapter count of the line it came from.
chapter_heading <- regex("^chapter [\\divxlc]", ignore_case = TRUE)

austen_lines <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, chapter_heading))
  ) %>%
  ungroup()

tidy_books <- unnest_tokens(austen_lines, word, text)

# Preview the first five tokens as a table.
knitr::kable(head(tidy_books, 5))
book linenumber chapter word
Sense & Sensibility 1 0 sense
Sense & Sensibility 1 0 and
Sense & Sensibility 1 0 sensibility
Sense & Sensibility 3 0 by
Sense & Sensibility 3 0 jane

Using the nrc sentiment dictionary on the book exercise

# Keep only the NRC entries tagged "joy", then tally how often each of those
# words occurs in "Emma", most frequent first.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

emma_words <- filter(tidy_books, book == "Emma")
emma_words %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # … with 291 more rows

Using sentiment dictionary bing on the book exercise

library(tidyr)

# Net Bing sentiment (positive minus negative word count) for every 80-line
# section of every book.
bing_hits <- inner_join(tidy_books, get_sentiments("bing"))

jane_austen_sentiment <- bing_hits %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)

# One panel per book: net sentiment of each 80-line section along the
# narrative.
jane_austen_sentiment %>%
  ggplot(aes(x = index, y = sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")

## Comparing the three sentiment dictionaries

# Restrict to "Pride & Prejudice" and sum the AFINN word scores within each
# 80-line section.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")

afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining, by = "word"
# Score "Pride & Prejudice" with Bing and with the positive/negative subset of
# NRC, then compute net sentiment per 80-line section for each method.
nrc_polarity <- filter(
  get_sentiments("nrc"),
  sentiment %in% c("positive", "negative")
)

bing_scored <- pride_prejudice %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(method = "Bing et al.")

nrc_scored <- pride_prejudice %>%
  inner_join(nrc_polarity) %>%
  mutate(method = "NRC")

bing_and_nrc <- bind_rows(bing_scored, nrc_scored) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"

Combining the results from the three dictionaries (AFINN, Bing and NRC).

# Stack the three per-section scores and draw one panel per lexicon.
all_methods <- bind_rows(afinn, bing_and_nrc)

ggplot(all_methods, aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

# Word cloud of the 100 most frequent non-stop-words across all the novels.
austen_word_freq <- tidy_books %>%
  anti_join(stop_words) %>%
  count(word)

wordcloud(austen_word_freq$word, austen_word_freq$n, max.words = 100)
## Joining, by = "word"

## Negative vs positive analysis using bing dictionary

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Build a word x sentiment count matrix from the Bing matches and draw it as a
# comparison cloud.
austen_bing_matrix <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0)

comparison.cloud(
  austen_bing_matrix,
  colors = c("blue", "green"),
  max.words = 100
)
## Joining, by = "word"

Part 2

The example below is a sentiment analysis of Ukrainian President Volodymyr Zelensky’s address to the United States Congress.

The text of his address was taken from the Washington post article available at: https://www.washingtonpost.com/politics/2022/03/16/text-zelensky-address-congress/

# Download the Washington Post transcript and keep the text of every <p>
# element, one character string per paragraph.
speech_website <- read_html(
  "https://www.washingtonpost.com/politics/2022/03/16/text-zelensky-address-congress/"
)
speech <- html_text(html_nodes(speech_website, "p"))
library(syuzhet)
## 
## Attaching package: 'syuzhet'
## The following object is masked from 'package:rtweet':
## 
##     get_tokens
get_sentiment(speech[2:50])
##  [1]  0.00  0.00  0.25  1.55  2.90  0.45  5.15  7.55 -0.15 -1.15  3.75  2.25
## [13] -1.70 -1.75  0.45 -0.90  3.10  1.45  1.90  0.50  0.00  0.00  0.00  0.00
## [25]  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00
## [37]  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00
## [49]  0.00
knitr::kable(get_nrc_sentiment(speech[2:50]))
## Warning: `spread_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `spread()` instead.
## ℹ The deprecated feature was likely used in the syuzhet package.
##   Please report the issue to the authors.
anger anticipation disgust fear joy sadness surprise trust negative positive
0 0 0 0 0 0 0 0 0 0
1 2 1 0 0 0 0 2 1 2
0 0 0 0 0 0 0 0 0 1
0 1 0 0 1 0 0 1 0 1
0 2 1 2 3 0 0 3 1 3
3 0 0 2 2 1 0 3 3 4
2 1 1 1 3 0 0 3 2 7
0 1 0 0 2 0 1 5 0 6
3 1 3 3 0 4 1 3 4 3
2 2 1 4 3 3 2 2 5 6
2 2 1 4 3 0 0 3 1 9
4 2 0 4 0 0 0 4 5 9
4 3 1 4 2 2 2 10 9 10
4 2 2 4 1 3 1 2 6 9
3 4 1 4 4 3 2 9 8 11
1 1 0 3 0 0 0 0 1 1
1 5 0 2 2 1 0 5 3 4
1 1 0 0 1 0 0 2 1 3
1 1 0 1 1 1 0 4 1 5
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
# Pass the selected paragraphs through get_sentences(), score every resulting
# element with get_sentiment(), and print the scores.
# NOTE(review): the trailing scores are all exactly 0 -- presumably
# `speech[2:50]` runs past the end of `speech` and yields NA entries; confirm
# with length(speech).
s_v <- get_sentences(speech[2:50])
s_v_sentiment <- get_sentiment(s_v)
s_v_sentiment
##   [1]  0.00  0.00  0.00  0.00  0.25  1.55  0.50  1.55  0.85  0.45  2.80 -2.00
##  [13]  5.15  6.00  0.00  0.75  0.80 -1.50  0.00 -1.70  1.30 -0.40  1.40 -1.25
##  [25] -0.50 -0.35 -1.00  1.10  0.75  0.10  0.75  2.75  0.00  0.25  0.25  0.00
##  [37]  0.75  0.25  1.20  2.90 -0.75 -0.10  0.40 -0.75 -2.15 -1.05  1.25 -0.50
##  [49] -1.25  0.60  1.75 -0.25 -0.60  0.00 -0.90  0.00  0.00  0.90  5.20 -0.15
##  [61]  0.95 -0.70 -1.25  0.00 -0.40 -1.50  1.00 -0.50  1.00  1.35  0.75  2.50
##  [73] -0.55  2.00  0.00 -0.90 -0.40  1.15  0.75  1.05  1.00  0.50  0.00  0.00
##  [85]  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00
##  [97]  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.00
## [109]  0.00  0.00  0.00  0.00
knitr::kable(get_sentiments("nrc") %>% count(sentiment, sort = TRUE) )
sentiment n
negative 3316
positive 2308
fear 1474
anger 1245
trust 1230
sadness 1187
disgust 1056
anticipation 837
joy 687
surprise 532
# Paragraphs 2-50 of the scraped page hold the speech. Indexing past the end of
# `speech` yields NA, so drop those before tokenising.
tidy_speech <- speech[2:50]
tidy_speech <- tidy_speech[!is.na(tidy_speech)]

# Tokenise with unnest_tokens(), the same tokenizer used for the Austen
# corpus. It lower-cases the text and strips punctuation, which a plain
# strsplit(" ") does not do, so tokens such as "Peace," or "freedom." can now
# match the lower-case lexicon entries.
speech_tokens <- unnest_tokens(
  data.frame(text = tidy_speech, stringsAsFactors = FALSE),
  word, text
)
tidy_speech_words <- speech_tokens[["word"]]

# seq_along() is safe when there are zero tokens; seq(1:length(x)) is not.
rowNumber <- seq_along(tidy_speech_words)
words.df <- data.frame(
  rowNumber,
  word = tidy_speech_words,
  stringsAsFactors = FALSE
)

# Every (token, NRC sentiment) match. The variable name is historical; the
# quanteda package is not involved.
speech_sentiment_quanteda <- inner_join(
  words.df,
  get_sentiments("nrc"),
  by = "word"
)
## Joining, by = "word"
# Frequency of every (word, NRC sentiment) pair found in the speech, most
# frequent first.
nrc_matches <- inner_join(words.df, get_sentiments("nrc"))
lang_word_counts <- ungroup(
  count(nrc_matches, word, sentiment, sort = TRUE)
)
## Joining, by = "word"

This code breaks the words into different categories such as anger, anticipation, fear, and sadness.

# Pick the five most frequent words within each NRC sentiment (ties kept) and
# plot them, one panel per sentiment.
top_sentiment_words <- lang_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 5) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))

ggplot(top_sentiment_words, aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  labs(
    x = "Contribution to sentiment",
    y = NULL
  )

Major word counts of this speech

library(reshape2)

# Comparison cloud of the speech's words across the NRC sentiment categories.
#
# `lang_word_counts` already holds one row per (word, sentiment) with its
# frequency in `n`. Joining it to NRC again (on word + sentiment) and
# re-counting would reset every frequency to 1, so the existing counts are
# cast straight into a word x sentiment matrix.
nrc_cloud_matrix <- acast(
  lang_word_counts,
  word ~ sentiment,
  value.var = "n",
  fill = 0
)

# One colour per sentiment column. "Paired" offers up to 12 colours, enough
# for the 10 NRC categories; brewer.pal() needs n >= 3.
comparison.cloud(
  nrc_cloud_matrix,
  colors = brewer.pal(max(3, ncol(nrc_cloud_matrix)), "Paired"),
  max.words = 100
)
## Joining, by = c("word", "sentiment")
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## terrible could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## humanitarian could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## punished could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## attacking could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## death could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## beautiful could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## freedom could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## freely could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## kind could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## save could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## peace could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## proud could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## fell could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## ongoing could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## seek could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## thought could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## time could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## die could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## kill could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## depend could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## watch could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## conflict could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## resisting could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## terrorize could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## defense could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## invasion could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## unjust could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## words could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## moral could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## aggression could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## aggressor could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## brutal could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## fight could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## experienced could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## honor could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## important could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## innocent could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## justice could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## leader could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## provide could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## respects could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## responsible could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## sincere could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## strength could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## battlefield could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## government could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## democracy could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## foundation could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## grateful could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## offer could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## overwhelming could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## preserve could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## protect could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 100):
## sense could not be fit on page. It will not be plotted.

## Word counts of positive and negative words as a group.

library(reshape2)

# Comparison cloud of the speech's positive vs. negative words under the Bing
# lexicon.
#
# Start from the raw tokens in `words.df`, not from the NRC-tagged
# `lang_word_counts`. With no explicit key, a join from that table matches on
# both `word` and `sentiment`, keeping only words whose NRC label equals the
# Bing label, and the re-count then resets every frequency to 1.
speech_bing_matrix <- words.df %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0)

# Columns come out in alphabetical order: negative = blue, positive = green.
comparison.cloud(
  speech_bing_matrix,
  colors = c("blue", "green"),
  max.words = 500
)
## Joining, by = c("word", "sentiment")
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 500):
## important could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 500):
## peace could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 500):
## protect could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 500):
## proud could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 500):
## offensive could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 500):
## terrible could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., colors = c("blue", "green"), max.words = 500):
## terrorize could not be fit on page. It will not be plotted.

Conclusion:

The sentiment analysis of Ukrainian President Zelensky’s address to Congress seems to be in line with the situation. He used words of anger repeatedly. He is fearful but also positive about the leadership of the US. The sentiment analysis could be helpful for members of Congress and the White House in formulating the right policy towards Ukraine.