Sentiment Analysis Introduction:

The assignment was to reproduce the code from tidytextmining.com, then choose a different corpus and perform the same sentiment analysis as the example, plus an analysis with one additional sentiment lexicon. The book uses the nrc, bing, and afinn lexicons; the additional analysis here uses the loughran lexicon. The scripts from The Lord of the Rings movies were used as the new corpus.

The code was originally from https://www.tidytextmining.com/sentiment.html.

Silge, J., & Robinson, D. (n.d.). Chapter 2: Sentiment analysis with tidy data. In Text Mining with R: A Tidy Approach. Retrieved April 3, 2024, from https://www.tidytextmining.com/sentiment.html. ISBN-10: 1491981652; ISBN-13: 978-1491981658.

R chunks 1-13 are copied from the chapter; only certain libraries were added.

library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.3
library(textdata)
## Warning: package 'textdata' was built under R version 4.3.3
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.3.3
## Loading required package: RColorBrewer
library(RCurl)
## 
## Attaching package: 'RCurl'
## 
## The following object is masked from 'package:tidyr':
## 
##     complete
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.3.3
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "ndiMatrix" of class "replValueSp"; definition not updated
## Package version: 3.3.1
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 20 of 20 threads used.
## See https://quanteda.io for tutorials and examples.
library(readtext)
## Warning: package 'readtext' was built under R version 4.3.3
## 
## Attaching package: 'readtext'
## 
## The following object is masked from 'package:quanteda':
## 
##     texts
get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.3.3
library(dplyr)
library(stringr)
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", 
                                      ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows
library(tidyr)

jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
library(ggplot2)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

pride_prejudice <- tidy_books %>% 
  filter(book == "Pride & Prejudice")

pride_prejudice
## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ℹ 122,194 more rows
afinn <- pride_prejudice %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  pride_prejudice %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
bing_word_counts
## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ℹ 2,575 more rows
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

custom_stop_words <- bind_rows(tibble(word = c("miss"),  
                                      lexicon = c("custom")), 
                               stop_words)

custom_stop_words
## # A tibble: 1,150 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # ℹ 1,140 more rows
library(wordcloud)

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining with `by = join_by(word)`

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

p_and_p_sentences <- tibble(text = prideprejudice) %>% 
  unnest_tokens(sentence, text, token = "sentences")
p_and_p_sentences$sentence[2]
## [1] "by jane austen"
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex", 
                pattern = "Chapter|CHAPTER [\\dIVXLC]") %>%
  ungroup()

austen_chapters %>% 
  group_by(book) %>% 
  summarise(chapters = n())
## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25
bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  slice_max(ratio, n = 1) %>% 
  ungroup()
## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

Performing the same analysis on the scripts for the Lord of the Rings movies.

library(corpustools)
## Warning: package 'corpustools' was built under R version 4.3.3
## 
## Attaching package: 'corpustools'
## The following object is masked from 'package:tidytext':
## 
##     get_stopwords
library(tm)
## Warning: package 'tm' was built under R version 4.3.3
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following objects are masked from 'package:quanteda':
## 
##     meta, meta<-
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## 
## Attaching package: 'tm'
## The following object is masked from 'package:quanteda':
## 
##     stopwords
episode4 <- getURL("https://storage.googleapis.com/kagglesdsdata/datasets/25491/32521/SW_EpisodeIV.txt?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240403%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240403T003858Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=5ad0b59ab9ef17cb90a03d81bcdf43bfd5f42c372b6171849f317925bb374f0df4a273806d01df0b63cfd3de634fd2a6fa31f43d29b788cf468501ba8213713bc74d91f3e5ca3193debcdfd78ca7069244973a903907c28c46f8456939b1fb015a8e069f88ea93d9b1db45e40d9f65466c1688e31e4fd4ae5016b42892ef86d9944c93113640740106f1f50d1433979b7042fcb9c31be80a3aa0cec3341d54f3d922041a381e77361c6f58f167a11baf03e14eee3e8be6d81ae4fe22c5ca618bcb6b89b402de63d8e4cc5cc6d722b52b8f7f58ce60d05102301d86bef3e2634dcd25cf5335953fcdd74d1f5b22035b6804b787cac2b8579f2f256003fb47bd72")

episode4df <- data.frame("line", "character", "dialogue")

# The script for Star Wars Episode IV: A New Hope was also considered, but ultimately the LOTR scripts were used.

LOTR <- getURL("https://raw.githubusercontent.com/division-zero/Data607/main/Week%2010%20assignment/lotr_scripts.csv")

LOTRdf <- data.frame(read.csv(text = LOTR))

corp_LOTR <- corpus(LOTRdf, text_field = "dialog")
print(corp_LOTR)
## Corpus consisting of 2,390 documents and 3 docvars.
## text1 :
## "Oh Smeagol Ive got one! , Ive got a fish Smeagol, Smeagol!  ..."
## 
## text2 :
## "Pull it in! Go on, go on, go on, pull it in!  "
## 
## text3 :
## "Arrghh! "
## 
## text4 :
## "Deagol!  "
## 
## text5 :
## "Deagol!  "
## 
## text6 :
## "Deagol!  "
## 
## [ reached max_ndoc ... 2,384 more documents ]
#view(corp_LOTR)
unique(LOTRdf$movie)
## [1] "The Return of the King "     "The Two Towers "            
## [3] "The Fellowship of the Ring "
head(LOTRdf)
##   X    char                                                         dialog
## 1 0  DEAGOL Oh Smeagol Ive got one! , Ive got a fish Smeagol, Smeagol!    
## 2 1 SMEAGOL                 Pull it in! Go on, go on, go on, pull it in!  
## 3 2  DEAGOL                                                       Arrghh! 
## 4 3 SMEAGOL                                                      Deagol!  
## 5 4 SMEAGOL                                                      Deagol!  
## 6 5 SMEAGOL                                                      Deagol!  
##                     movie
## 1 The Return of the King 
## 2 The Return of the King 
## 3 The Return of the King 
## 4 The Return of the King 
## 5 The Return of the King 
## 6 The Return of the King
tidy_LOTR <- LOTRdf %>%
  group_by(movie) %>%
  mutate(
    linenumber = row_number()) %>%
  ungroup() %>%
  unnest_tokens(word, dialog)

head(tidy_LOTR)
## # A tibble: 6 × 5
##       X char   movie                     linenumber word   
##   <int> <chr>  <chr>                          <int> <chr>  
## 1     0 DEAGOL "The Return of the King "          1 oh     
## 2     0 DEAGOL "The Return of the King "          1 smeagol
## 3     0 DEAGOL "The Return of the King "          1 ive    
## 4     0 DEAGOL "The Return of the King "          1 got    
## 5     0 DEAGOL "The Return of the King "          1 one    
## 6     0 DEAGOL "The Return of the King "          1 ive
nrc_joy <- get_sentiments("nrc") %>% 
  filter(sentiment == "joy")

tidy_LOTR %>%
  filter(movie == "The Fellowship of the Ring ") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 46 × 2
##    word         n
##    <chr>    <int>
##  1 good        16
##  2 friend      10
##  3 gift         4
##  4 love         4
##  5 peace        4
##  6 birthday     3
##  7 found        3
##  8 merry        3
##  9 precious     3
## 10 grow         2
## # ℹ 36 more rows
tidy_LOTR %>%
  filter(movie == "The Two Towers ") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 55 × 2
##    word         n
##    <chr>    <int>
##  1 good        23
##  2 merry       19
##  3 precious    15
##  4 friend       9
##  5 alive        8
##  6 tree         8
##  7 save         7
##  8 hope         6
##  9 found        5
## 10 child        4
## # ℹ 45 more rows
tidy_LOTR %>%
  filter(movie == "The Return of the King ") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 70 × 2
##    word         n
##    <chr>    <int>
##  1 merry       15
##  2 precious    15
##  3 hope        11
##  4 love        11
##  5 peace        9
##  6 tree         9
##  7 save         8
##  8 victory      8
##  9 good         7
## 10 journey      6
## # ℹ 60 more rows

## LOTR Start

The LOTR CSV file containing the dialogue lines from the three movies was read into a data frame. Each word of dialogue was then extracted into its own row, and the joy words from each movie and their counts are listed above.
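Note that the movie titles in the CSV carry trailing spaces (for example "The Return of the King "), which is why the filters in this document match titles with a trailing space. A small sketch of how the titles could be trimmed up front instead, assuming stringr (loaded with the tidyverse); LOTRdf_clean is a hypothetical name:

# Hypothetical cleanup: strip the trailing whitespace from the movie titles
# so later filters can use the plain titles.
LOTRdf_clean <- LOTRdf %>%
  mutate(movie = str_trim(movie))

unique(LOTRdf_clean$movie)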

LOTR_sentiment <- tidy_LOTR %>%
  inner_join(get_sentiments("bing")) %>%
  count(movie, index = linenumber %/% 20, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
library(ggplot2)

ggplot(LOTR_sentiment, aes(index, sentiment, fill = movie)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~movie, ncol = 2, scales = "free_x")

the_fellowship <- tidy_LOTR %>% 
  filter(movie == "The Fellowship of the Ring ")

the_fellowship
## # A tibble: 5,465 × 5
##        X char  movie                         linenumber word  
##    <int> <chr> <chr>                              <int> <chr> 
##  1  1400 MERRY "The Fellowship of the Ring "          1 what  
##  2  1400 MERRY "The Fellowship of the Ring "          1 are   
##  3  1400 MERRY "The Fellowship of the Ring "          1 they  
##  4  1400 MERRY "The Fellowship of the Ring "          1 eating
##  5  1400 MERRY "The Fellowship of the Ring "          1 when  
##  6  1400 MERRY "The Fellowship of the Ring "          1 they  
##  7  1400 MERRY "The Fellowship of the Ring "          1 can't 
##  8  1400 MERRY "The Fellowship of the Ring "          1 get   
##  9  1400 MERRY "The Fellowship of the Ring "          1 hobbit
## 10  1402 FRODO "The Fellowship of the Ring "          3 who   
## # ℹ 5,455 more rows

## Sentiment across scripts

Sentiment scores from the Bing lexicon were plotted across each movie's script, summed over chunks of 20 dialogue lines. Most chunks score negatively in all three movies.
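The 20-line chunks come from integer division of the line number: linenumber %/% 20 maps lines 0-19 to index 0, lines 20-39 to index 1, and so on. The book used 80-line chunks for the novels; a smaller chunk size is used here, presumably because movie dialogue lines are much shorter. A quick illustration:

# Integer division groups consecutive line numbers into fixed-size chunks
c(0, 19, 20, 45) %/% 20
## [1] 0 0 1 2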

afinn <- the_fellowship %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 20) %>% 
  summarise(sentiment = sum(value)) %>% 
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  the_fellowship %>% 
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  the_fellowship %>% 
    inner_join(get_sentiments("nrc") %>% 
                 filter(sentiment %in% c("positive", 
                                         "negative"))
    ) %>%
    mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 20, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 49 of `x` matches multiple rows in `y`.
## ℹ Row 1627 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
bind_rows(afinn, 
          bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

## Comparing the sentiment lexicons

We can compare the sentiment analysis from the different sentiment lexicons for the movie "The Fellowship of the Ring".
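Because each lexicon contains a different set of words, the lexicons match different numbers of dialogue tokens. A rough coverage check, sketched against the tidy_LOTR tokens built above:

# Count how many LOTR tokens each lexicon recognizes at all
sapply(c("bing", "nrc", "afinn", "loughran"), function(lex) {
  tidy_LOTR %>%
    semi_join(get_sentiments(lex), by = "word") %>%
    nrow()
})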

bing_word_counts_LOTR <- tidy_LOTR %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
bing_word_counts_LOTR
## # A tibble: 477 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 master   positive     51
##  2 good     positive     46
##  3 dead     negative     45
##  4 death    negative     37
##  5 merry    positive     37
##  6 precious positive     33
##  7 well     positive     31
##  8 like     positive     30
##  9 great    positive     27
## 10 right    positive     25
## # ℹ 467 more rows

## List of the Bing sentiments

A list of the words and their Bing sentiments for the whole trilogy of movies.

bing_word_counts_LOTR %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

## Bing word counts

The plot above shows how each word contributes to the sentiment score using Bing, listing the most common negative and positive words.
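One caveat mirrors the "miss" example from the book: Bing scores "merry" as positive, but in these scripts it is often the character Merry, which inflates the positive counts. A sketch of treating it as a custom stop word, following the custom_stop_words pattern used earlier (custom_stop_words_LOTR is a hypothetical name):

# Treat the character name Merry as a stop word before counting sentiments
custom_stop_words_LOTR <- bind_rows(
  tibble(word = c("merry"), lexicon = c("custom")),
  stop_words)

tidy_LOTR %>%
  anti_join(custom_stop_words_LOTR) %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)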

tidy_LOTR %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining with `by = join_by(word)`

## Word Cloud

A word cloud can show the frequency of words in a visual way.
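The layout of a word cloud is randomized, so setting a seed makes the figure reproducible, and a palette can be passed through the colors argument. A small sketch, assuming the RColorBrewer palettes loaded alongside wordcloud:

# Fix the random layout and color the cloud with a Brewer palette
set.seed(1234)
tidy_LOTR %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100,
                 colors = brewer.pal(8, "Dark2")))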

library(reshape2)

tidy_LOTR %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("red", "gray20"),
                   max.words = 100)
## Joining with `by = join_by(word)`

## Word Cloud continued

A word cloud can also be adapted to show the sentiment of the words, as in the comparison cloud above.

bingnegative <- get_sentiments("bing") %>% 
  filter(sentiment == "negative")

wordcountsLOTR <- tidy_LOTR %>%
  group_by(movie) %>%
  summarize(words = n())

tidy_LOTR %>%
  semi_join(bingnegative) %>%
  group_by(movie) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcountsLOTR, by = c("movie")) %>%
  mutate(ratio = negativewords/words) %>%
  filter() %>%
  slice_max(ratio, n = 3) %>% 
  ungroup()
## Joining with `by = join_by(word)`
## # A tibble: 3 × 4
##   movie                         negativewords words  ratio
##   <chr>                                 <int> <int>  <dbl>
## 1 "The Return of the King "               398  9536 0.0417
## 2 "The Two Towers "                       374  9893 0.0378
## 3 "The Fellowship of the Ring "           172  5465 0.0315

## Most negative words

The Return of the King had the most negative words, and the highest ratio of negative words, of the three movies according to Bing.
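To see which specific words drive those negative counts, the same join can be grouped by movie; a sketch reusing the bingnegative subset defined above:

# Most frequent Bing-negative words within each movie
tidy_LOTR %>%
  semi_join(bingnegative) %>%
  count(movie, word, sort = TRUE) %>%
  group_by(movie) %>%
  slice_max(n, n = 5) %>%
  ungroup()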

## One additional sentiment lexicon: loughran

library(syuzhet)
## Warning: package 'syuzhet' was built under R version 4.3.3
unique(get_sentiments("loughran")[,2])
## # A tibble: 6 × 1
##   sentiment   
##   <chr>       
## 1 negative    
## 2 positive    
## 3 uncertainty 
## 4 litigious   
## 5 constraining
## 6 superfluous
loughran_word_counts_LOTR <- tidy_LOTR %>%
  inner_join(get_sentiments("loughran")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("loughran")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1741 of `x` matches multiple rows in `y`.
## ℹ Row 2826 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
loughran_word_counts_LOTR
## # A tibble: 202 × 3
##    word     sentiment       n
##    <chr>    <chr>       <int>
##  1 good     positive       46
##  2 shall    litigious      32
##  3 could    uncertainty    30
##  4 great    positive       27
##  5 may      uncertainty    21
##  6 lost     negative       19
##  7 fear     negative       17
##  8 against  negative       15
##  9 destroy  negative       12
## 10 strength positive       12
## # ℹ 192 more rows
loughran_word_counts_LOTR %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

## Conclusion:

Loughran includes additional sentiment categories (uncertainty, litigious, constraining, and superfluous), and the frequency of words in each category across the three movies can be plotted. In this assignment we have shown how the sentiment analysis from the tidytextmining.com book can be applied to a different corpus, namely the Lord of the Rings movie scripts.