Required packages and libraries:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(janeaustenr)
library(wordcloud)
## Loading required package: RColorBrewer
library(gutenbergr)
library(dplyr)
library(tidyr)

AFINN and Bing sentiment lexicons:

get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

Browse the catalogue of books available through the gutenbergr package:

gutenberg_metadata
## # A tibble: 72,569 × 8
##    gutenberg_id title    author gutenberg_author_id language gutenberg_bookshelf
##           <int> <chr>    <chr>                <int> <chr>    <chr>              
##  1            1 "The De… Jeffe…                1638 en       "Politics/American…
##  2            2 "The Un… Unite…                   1 en       "Politics/American…
##  3            3 "John F… Kenne…                1666 en       ""                 
##  4            4 "Lincol… Linco…                   3 en       "US Civil War"     
##  5            5 "The Un… Unite…                   1 en       "United States/Pol…
##  6            6 "Give M… Henry…                   4 en       "American Revoluti…
##  7            7 "The Ma… <NA>                    NA en       ""                 
##  8            8 "Abraha… Linco…                   3 en       "US Civil War"     
##  9            9 "Abraha… Linco…                   3 en       "US Civil War"     
## 10           10 "The Ki… <NA>                    NA en       "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>

From the list of books, I selected The United States Constitution.

# Look the work up by its exact title and fetch its text, keeping the title
# as an extra column next to every line of text.
constitution_data1 <- gutenberg_download(
  gutenberg_works(title == "The United States Constitution"),
  meta_fields = "title"
)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Number every line, derive a running `chapter` counter from lines that consist
# only of digits / roman-numeral letters, then split the text into one word per
# row.
# NOTE(review): the rendered output of count_constitution below shows a single
# chapter group with value 0, so this heading pattern appears not to match any
# line of this text -- confirm the heading format if chapters are needed.
constitution_data2 <- constitution_data1 %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^([\\divxlc])+$", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

# Net AFINN score per 80-line chunk: keep only words present in the lexicon,
# bucket them by line number and add up their numeric scores.
constitution_data3 <- constitution_data2 %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
constitution_data3
## # A tibble: 7 × 3
##   index sentiment method
##   <dbl>     <dbl> <chr> 
## 1     0        24 AFINN 
## 2     1         8 AFINN 
## 3     2        15 AFINN 
## 4     3        13 AFINN 
## 5     4        19 AFINN 
## 6     5       -16 AFINN 
## 7     6        15 AFINN

Chart negative and positive sentiment:

# One bar per 80-line chunk; bar height is the chunk's net AFINN score.
ggplot(constitution_data3, aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE)

Count the words in the book

# Word frequencies within each chapter, most frequent first. The result stays
# grouped by chapter, as in the piped form.
count_constitution <- count(
  group_by(constitution_data2, chapter),
  word,
  sort = TRUE
)
count_constitution
## # A tibble: 999 × 3
## # Groups:   chapter [1]
##    chapter word       n
##      <int> <chr>  <int>
##  1       0 the      421
##  2       0 of       298
##  3       0 and      193
##  4       0 shall    191
##  5       0 be       127
##  6       0 to       119
##  7       0 in        94
##  8       0 states    83
##  9       0 or        82
## 10       0 a         57
## # ℹ 989 more rows

Remove stop words

# Drop every token that appears in tidytext's stop_words table.
remove_constitution <- anti_join(constitution_data2, stop_words)
remove_constitution
## # A tibble: 1,714 × 5
##    gutenberg_id title                          linenumber chapter word     
##           <int> <chr>                               <int>   <int> <chr>    
##  1            5 The United States Constitution          3       0 original 
##  2            5 The United States Constitution          3       0 project  
##  3            5 The United States Constitution          3       0 gutenberg
##  4            5 The United States Constitution          3       0 etexts   
##  5            5 The United States Constitution          3       0 compiled 
##  6            5 The United States Constitution          3       0 file     
##  7            5 The United States Constitution          4       0 improve  
##  8            5 The United States Constitution          4       0 content  
##  9            5 The United States Constitution          4       0 ratios   
## 10            5 The United States Constitution          4       0 etext    
## # ℹ 1,704 more rows

Count the words in the book after removing stop words

# Frequency of the remaining (non-stop) words, most frequent first.
count(remove_constitution, word, sort = TRUE)
## # A tibble: 780 × 2
##    word          n
##    <chr>     <int>
##  1 united       56
##  2 president    34
##  3 congress     29
##  4 house        23
##  5 law          23
##  6 section      22
##  7 office       19
##  8 senate       17
##  9 person       16
## 10 time         16
## # ℹ 770 more rows

Browse the catalogue of books available through the gutenbergr package:

gutenberg_metadata
## # A tibble: 72,569 × 8
##    gutenberg_id title    author gutenberg_author_id language gutenberg_bookshelf
##           <int> <chr>    <chr>                <int> <chr>    <chr>              
##  1            1 "The De… Jeffe…                1638 en       "Politics/American…
##  2            2 "The Un… Unite…                   1 en       "Politics/American…
##  3            3 "John F… Kenne…                1666 en       ""                 
##  4            4 "Lincol… Linco…                   3 en       "US Civil War"     
##  5            5 "The Un… Unite…                   1 en       "United States/Pol…
##  6            6 "Give M… Henry…                   4 en       "American Revoluti…
##  7            7 "The Ma… <NA>                    NA en       ""                 
##  8            8 "Abraha… Linco…                   3 en       "US Civil War"     
##  9            9 "Abraha… Linco…                   3 en       "US Civil War"     
## 10           10 "The Ki… <NA>                    NA en       "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>

From the list of books, I selected The United States Constitution for sentiment analysis.

# Reuse the text already downloaded into constitution_data1 (same title query)
# instead of fetching it from Project Gutenberg a second time; this keeps the
# document faster to knit and less dependent on the network.
constitution_data4 <- constitution_data1


# Number every line, derive a running `chapter` counter from lines that consist
# only of digits / roman-numeral letters, then split the text into one word per
# row.
# NOTE(review): the rendered output of count_constitution below shows a single
# chapter group with value 0, so this heading pattern appears not to match any
# line of this text -- confirm the heading format if chapters are needed.
constitution_data5 <- constitution_data4 %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^([\\divxlc])+$", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

# Net Bing sentiment per 100-line chunk: count positive and negative words in
# each chunk, spread the two labels into columns and subtract.
constitution_data6 <- constitution_data5 %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 100) %>%
  count(title, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
constitution_data6
## # A tibble: 6 × 5
##   title                          index negative positive sentiment
##   <chr>                          <dbl>    <int>    <int>     <int>
## 1 The United States Constitution     0        3       11         8
## 2 The United States Constitution     1       13        8        -5
## 3 The United States Constitution     2        8       15         7
## 4 The United States Constitution     3       15       13        -2
## 5 The United States Constitution     4       20       16        -4
## 6 The United States Constitution     5        2        5         3

Bar diagram for negative and positive sentiment

# One bar per 100-line chunk; bar height is positive minus negative word count.
constitution_data6 %>%
  ggplot(aes(x = index, y = sentiment, fill = title)) +
  geom_col(show.legend = FALSE)

Count the words in the book

# Word frequencies within each chapter, most frequent first. The result stays
# grouped by chapter, as in the piped form.
count_constitution <- count(
  group_by(constitution_data5, chapter),
  word,
  sort = TRUE
)
count_constitution
## # A tibble: 999 × 3
## # Groups:   chapter [1]
##    chapter word       n
##      <int> <chr>  <int>
##  1       0 the      421
##  2       0 of       298
##  3       0 and      193
##  4       0 shall    191
##  5       0 be       127
##  6       0 to       119
##  7       0 in        94
##  8       0 states    83
##  9       0 or        82
## 10       0 a         57
## # ℹ 989 more rows

Remove stop words

# Drop every token that appears in tidytext's stop_words table.
remove_constitution <- anti_join(constitution_data5, stop_words)
remove_constitution
## # A tibble: 1,714 × 5
##    gutenberg_id title                          linenumber chapter word     
##           <int> <chr>                               <int>   <int> <chr>    
##  1            5 The United States Constitution          3       0 original 
##  2            5 The United States Constitution          3       0 project  
##  3            5 The United States Constitution          3       0 gutenberg
##  4            5 The United States Constitution          3       0 etexts   
##  5            5 The United States Constitution          3       0 compiled 
##  6            5 The United States Constitution          3       0 file     
##  7            5 The United States Constitution          4       0 improve  
##  8            5 The United States Constitution          4       0 content  
##  9            5 The United States Constitution          4       0 ratios   
## 10            5 The United States Constitution          4       0 etext    
## # ℹ 1,704 more rows

Count the words in the book after removing stop words

# Frequency of the remaining (non-stop) words, most frequent first.
count(remove_constitution, word, sort = TRUE)
## # A tibble: 780 × 2
##    word          n
##    <chr>     <int>
##  1 united       56
##  2 president    34
##  3 congress     29
##  4 house        23
##  5 law          23
##  6 section      22
##  7 office       19
##  8 senate       17
##  9 person       16
## 10 time         16
## # ℹ 770 more rows

Get Positive and negative sentiment

# How often each Bing-labelled word occurs once stop words are removed,
# together with its positive/negative label, most frequent first.
constitution_data7 <- remove_constitution %>%
  inner_join(y = get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE, name = "n") %>%
  ungroup()
## Joining with `by = join_by(word)`
constitution_data7
## # A tibble: 70 × 3
##    word        sentiment     n
##    <chr>       <chr>     <int>
##  1 vice        negative      8
##  2 supreme     positive      7
##  3 treason     negative      7
##  4 inferior    negative      4
##  5 proper      positive      4
##  6 trust       positive      4
##  7 affirmation positive      3
##  8 debts       negative      3
##  9 objections  negative      3
## 10 resignation negative      3
## # ℹ 60 more rows

Bar diagram for both negative and positive sentiment

# Top words per sentiment label (ties at the cut-off are kept), drawn as one
# horizontal bar chart per label with bars ordered by frequency.
constitution_data7 %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)

Word cloud of positive words

# Keep only the words Bing labels as positive.
pos <- constitution_data7 %>%
  filter(sentiment == "positive")

# wordcloud() lays words out using random numbers, so fix the seed to make the
# figure identical on every knit.
set.seed(1234)
wordcloud(
  words = pos$word,
  freq = pos$n,
  max.words = 100,
  colors = "blue")

Word cloud of negative words

# Keep only the words Bing labels as negative.
neg <- constitution_data7 %>%
  filter(sentiment == "negative")

# Word cloud of negative words.
# wordcloud() lays words out using random numbers, so fix the seed to make the
# figure identical on every knit.
set.seed(1234)
wordcloud(
  words = neg$word,
  freq = neg$n,
  max.words = 30,
  colors = "blue")

Download the text of The United States Constitution

Perform sentiment analysis with AFINN and Bing lexicons

# Reuse the text already downloaded into constitution_data1 (same title query)
# instead of fetching it from Project Gutenberg again, and add a linenumber
# column to keep track of the line each word came from.
The_United_States_Constitution <- constitution_data1 %>%
  mutate(linenumber = row_number())

# Tokenize the text into one word per row
The_United_States_Constitution_tokens <- The_United_States_Constitution %>%
  unnest_tokens(word, text)

# Perform sentiment analysis with AFINN and Bing lexicons.
# AFINN scores words numerically in a `value` column, whereas Bing labels them
# in a `sentiment` column, so each lexicon is reduced to one net score per
# 80-line chunk *before* the two results are stacked. (Counting `sentiment` on
# the stacked joins put every AFINN word under NA and left its score at 0.)
afinn_and_bing <- bind_rows(
  # AFINN method: net sentiment is the sum of the word scores in the chunk
  The_United_States_Constitution_tokens %>%
    inner_join(get_sentiments("afinn")) %>%
    group_by(index = linenumber %/% 80) %>%
    summarise(sentiment = sum(value)) %>%
    mutate(method = "afinn"),

  # Bing method: net sentiment is positive minus negative word counts
  The_United_States_Constitution_tokens %>%
    inner_join(get_sentiments("bing")) %>%
    count(index = linenumber %/% 80, sentiment) %>%
    pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
    mutate(sentiment = positive - negative, method = "bing")
) %>%
  # Keep only the columns both methods share
  select(method, index, sentiment)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
# View the result
afinn_and_bing
## # A tibble: 14 × 3
##    method index sentiment
##    <chr>  <dbl>     <dbl>
##  1 afinn      0        24
##  2 afinn      1         8
##  3 afinn      2        15
##  4 afinn      3        13
##  5 afinn      4        19
##  6 afinn      5       -16
##  7 afinn      6        15
##  8 bing       0        10
##  9 bing       1        -5
## 10 bing       2        -2
## 11 bing       3         6
## 12 bing       4        -1
## 13 bing       5        -5
## 14 bing       6         4
# Net sentiment per chunk, one panel per lexicon, each with its own y scale.
afinn_and_bing %>%
  ggplot(aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

Loughran sentiment

# Load the Loughran lexicon once; it is extended into custom_lexicon further
# down in this document.
loughran_lexicon <- get_sentiments("loughran")

Download the book from gutenberg

gutenberg_metadata
## # A tibble: 72,569 × 8
##    gutenberg_id title    author gutenberg_author_id language gutenberg_bookshelf
##           <int> <chr>    <chr>                <int> <chr>    <chr>              
##  1            1 "The De… Jeffe…                1638 en       "Politics/American…
##  2            2 "The Un… Unite…                   1 en       "Politics/American…
##  3            3 "John F… Kenne…                1666 en       ""                 
##  4            4 "Lincol… Linco…                   3 en       "US Civil War"     
##  5            5 "The Un… Unite…                   1 en       "United States/Pol…
##  6            6 "Give M… Henry…                   4 en       "American Revoluti…
##  7            7 "The Ma… <NA>                    NA en       ""                 
##  8            8 "Abraha… Linco…                   3 en       "US Civil War"     
##  9            9 "Abraha… Linco…                   3 en       "US Civil War"     
## 10           10 "The Ki… <NA>                    NA en       "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
# Reuse the text already downloaded into constitution_data1 (same title query)
# instead of fetching it from Project Gutenberg again.
constitution_lou <- constitution_data1


# Number every line, derive a running `chapter` counter from lines that consist
# only of digits / roman-numeral letters, then split the text into one word per
# row.
constitution_lou2 <- constitution_lou %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(text, regex("^([\\divxlc])+$", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

# Table with sentiments: word counts per Loughran category in each 100-line
# chunk, plus the net score (positive minus negative).
# A token can match several lexicon rows (and each lexicon row matches many
# tokens), so the many-to-many relationship is declared explicitly; otherwise
# dplyr warns about an unexpected many-to-many join.
constitution_lou3 <- constitution_lou2 %>%
  inner_join(get_sentiments("loughran"), relationship = "many-to-many") %>%
  count(title, index = linenumber %/% 100, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
constitution_lou3
## # A tibble: 7 × 9
##   title   index constraining litigious negative positive uncertainty superfluous
##   <chr>   <dbl>        <int>     <int>    <int>    <int>       <int>       <int>
## 1 The Un…     0            1        42        3        5           6           0
## 2 The Un…     1            3        74       21        2          10           0
## 3 The Un…     2            8        47        9        5           4           2
## 4 The Un…     3            1        59       11        5           7           2
## 5 The Un…     4            2        94       30        1           8           0
## 6 The Un…     5            3        41        3        0           2           0
## 7 The Un…     6            0         1        0        0           0           0
## # ℹ 1 more variable: sentiment <int>

Bar diagram of negative and positive sentiment for Loughran sentiment:

# One bar per 100-line chunk; bar height is positive minus negative word count.
constitution_lou3 %>%
  ggplot(aes(x = index, y = sentiment, fill = title)) +
  geom_col(show.legend = FALSE)

Custom lexicon words

# Extend the Loughran lexicon with two hand-labelled words.
custom_lexicon <- bind_rows(
  loughran_lexicon,
  tibble(
    word = c("black", "eyes"),
    sentiment = c("negative", "positive")
  )
)

# Look at the new sentiment counts with the custom lexicon.
# A token can match several lexicon rows (and each lexicon row matches many
# tokens), so the many-to-many relationship is declared explicitly; otherwise
# dplyr warns about an unexpected many-to-many join.
custom_words <- constitution_lou2 %>%
  inner_join(custom_lexicon, relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE)
## Joining with `by = join_by(word)`
custom_words
## # A tibble: 135 × 3
##    word         sentiment       n
##    <chr>        <chr>       <int>
##  1 shall        litigious     191
##  2 may          uncertainty    33
##  3 law          litigious      23
##  4 constitution litigious      15
##  5 thereof      litigious      12
##  6 laws         litigious      11
##  7 consent      litigious      10
##  8 legislature  litigious       9
##  9 against      negative        8
## 10 court        litigious       6
## # ℹ 125 more rows

Conclusion: Sentiment analysis is a technique used to understand the emotions and opinions expressed in text. It works by organizing text data in a tidy structure, which makes it easy to apply methods such as inner joins against sentiment lexicons. With sentiment analysis, we can track how emotions evolve within a story or identify the key words that express emotions and opinions. The chapter sets the stage for future case studies where different methods of sentiment analysis will be applied to various types of texts.