Assignment 10

In Text Mining with R, Chapter 2 looks at Sentiment Analysis. In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways:

Work with a different corpus of your choosing, and incorporate at least one additional sentiment lexicon (possibly from another R package that you’ve found through research). As usual, please submit links to both an .Rmd file posted in your GitHub repository and to your code on rpubs.com. You may work on a small team on this assignment.

library(tidyverse)
library(tidytext)
library(janeaustenr)
library(wordcloud)
library(gutenbergr)

# NRC lexicon: words paired with emotion / polarity categories
get_sentiments(lexicon = "nrc")
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows
# AFINN lexicon: one numeric `value` score per word
get_sentiments(lexicon = "afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows
# Bing lexicon: words paired with a sentiment label
get_sentiments(lexicon = "bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

AFINN Sentiment

# Print the gutenbergr metadata table to browse the catalogue of works
# (this line only displays the table; it does not download anything)
gutenberg_metadata
## # A tibble: 72,569 × 8
##    gutenberg_id title    author gutenberg_author_id language gutenberg_bookshelf
##           <int> <chr>    <chr>                <int> <chr>    <chr>              
##  1            1 "The De… Jeffe…                1638 en       "Politics/American…
##  2            2 "The Un… Unite…                   1 en       "Politics/American…
##  3            3 "John F… Kenne…                1666 en       ""                 
##  4            4 "Lincol… Linco…                   3 en       "US Civil War"     
##  5            5 "The Un… Unite…                   1 en       "United States/Pol…
##  6            6 "Give M… Henry…                   4 en       "American Revoluti…
##  7            7 "The Ma… <NA>                    NA en       ""                 
##  8            8 "Abraha… Linco…                   3 en       "US Civil War"     
##  9            9 "Abraha… Linco…                   3 en       "US Civil War"     
## 10           10 "The Ki… <NA>                    NA en       "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
# Look up "Heart of Darkness" in the Gutenberg catalogue and download its
# text, keeping the title as an extra column
dark <- gutenberg_download(
  gutenberg_works(title == "Heart of Darkness"),
  meta_fields = "title"
)

# Number every line and add a running chapter counter: the counter goes up on
# each line made up only of digits or the letters i, v, x, l, c (any case),
# i.e. the "I", "II", "III" headings. Then split the text into one word per row.
dark2 <- dark %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^([\\divxlc])+$", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

# Score each word with AFINN, total the scores over blocks of 80 lines,
# and label the result with the lexicon name
dark3 <- dark2 %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
dark3
## # A tibble: 42 × 3
##    index sentiment method
##    <dbl>     <dbl> <chr> 
##  1     0        24 AFINN 
##  2     1        18 AFINN 
##  3     2        -6 AFINN 
##  4     3       -20 AFINN 
##  5     4        41 AFINN 
##  6     5        47 AFINN 
##  7     6       -14 AFINN 
##  8     7       -35 AFINN 
##  9     8        11 AFINN 
## 10     9       -25 AFINN 
## # ℹ 32 more rows
# Bar chart of the summed AFINN score per index block (legend hidden)
ggplot(data = dark3, mapping = aes(x = index, y = sentiment, fill = method)) +
  geom_col(show.legend = FALSE)

# Count how often each word occurs within each chapter, most frequent first.
# Counting on `chapter` and `word` directly (rather than group_by() followed
# by count()) yields the same rows in the same order but returns an ungrouped
# tibble, so later verbs are not silently applied per chapter.
count_dark <- dark2 %>%
  count(chapter, word, sort = TRUE)
count_dark
## # A tibble: 8,379 × 3
##    chapter word      n
##      <int> <chr> <int>
##  1       1 the     868
##  2       2 the     778
##  3       3 the     646
##  4       1 a       532
##  5       1 of      520
##  6       3 i       456
##  7       3 of      432
##  8       2 of      421
##  9       1 and     392
## 10       1 i       352
## # ℹ 8,369 more rows
# Drop every token that appears in tidytext's `stop_words` table
remove_dark <- anti_join(dark2, stop_words)
remove_dark
## # A tibble: 12,943 × 5
##    gutenberg_id title             linenumber chapter word    
##           <int> <chr>                  <int>   <int> <chr>   
##  1          526 Heart of Darkness          1       0 note    
##  2          526 Heart of Darkness          1       0 etext   
##  3          526 Heart of Darkness          1       0 219     
##  4          526 Heart of Darkness          1       0 version 
##  5          526 Heart of Darkness          1       0 ebook   
##  6          526 Heart of Darkness          3       0 heart   
##  7          526 Heart of Darkness          3       0 darkness
##  8          526 Heart of Darkness          6       0 joseph  
##  9          526 Heart of Darkness          6       0 conrad  
## 10          526 Heart of Darkness         12       1 nellie  
## # ℹ 12,933 more rows
# Word frequencies after stop-word removal, most frequent first
count(remove_dark, word, sort = TRUE)
## # A tibble: 4,978 × 2
##    word        n
##    <chr>   <int>
##  1 kurtz     101
##  2 time       78
##  3 river      65
##  4 looked     56
##  5 eyes       49
##  6 station    48
##  7 head       47
##  8 heard      47
##  9 black      43
## 10 manager    42
## # ℹ 4,968 more rows
# Extend the stop-word list with "time", tagged with the lexicon name "custom"
custom_stop_words <- tibble(word = "time", lexicon = "custom") %>%
  bind_rows(stop_words)
custom_stop_words
## # A tibble: 1,150 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 time        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # ℹ 1,140 more rows
# Recount the words with the extended stop-word list applied, so that "time"
# no longer appears in the ranking
anti_join(remove_dark, custom_stop_words) %>%
  count(word, sort = TRUE)
## # A tibble: 4,977 × 2
##    word        n
##    <chr>   <int>
##  1 kurtz     101
##  2 river      65
##  3 looked     56
##  4 eyes       49
##  5 station    48
##  6 head       47
##  7 heard      47
##  8 black      43
##  9 manager    42
## 10 earth      39
## # ℹ 4,967 more rows

BING Sentiment

# Print the gutenbergr metadata table to browse the catalogue of works
# (this line only displays the table; it does not download anything)
gutenberg_metadata
## # A tibble: 72,569 × 8
##    gutenberg_id title    author gutenberg_author_id language gutenberg_bookshelf
##           <int> <chr>    <chr>                <int> <chr>    <chr>              
##  1            1 "The De… Jeffe…                1638 en       "Politics/American…
##  2            2 "The Un… Unite…                   1 en       "Politics/American…
##  3            3 "John F… Kenne…                1666 en       ""                 
##  4            4 "Lincol… Linco…                   3 en       "US Civil War"     
##  5            5 "The Un… Unite…                   1 en       "United States/Pol…
##  6            6 "Give M… Henry…                   4 en       "American Revoluti…
##  7            7 "The Ma… <NA>                    NA en       ""                 
##  8            8 "Abraha… Linco…                   3 en       "US Civil War"     
##  9            9 "Abraha… Linco…                   3 en       "US Civil War"     
## 10           10 "The Ki… <NA>                    NA en       "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
# Look up "Heart of Darkness" in the Gutenberg catalogue and download its
# text, keeping the title as an extra column
heart <- gutenberg_download(
  gutenberg_works(title == "Heart of Darkness"),
  meta_fields = "title"
)


# Number every line and add a running chapter counter: the counter goes up on
# each line made up only of digits or the letters i, v, x, l, c (any case),
# i.e. the "I", "II", "III" headings. Then split the text into one word per row.
heart2 <- heart %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^([\\divxlc])+$", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

# Tag each word with its Bing sentiment, count the words per sentiment in
# blocks of 100 lines, spread the counts into one column per sentiment, and
# compute net sentiment as positive minus negative
heart3 <- heart2 %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 100) %>%
  count(title, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
heart3
## # A tibble: 34 × 5
##    title             index negative positive sentiment
##    <chr>             <dbl>    <int>    <int>     <int>
##  1 Heart of Darkness     0       34       40         6
##  2 Heart of Darkness     1       51       38       -13
##  3 Heart of Darkness     2       38       39         1
##  4 Heart of Darkness     3       36       44         8
##  5 Heart of Darkness     4       48       46        -2
##  6 Heart of Darkness     5       60       29       -31
##  7 Heart of Darkness     6       56       24       -32
##  8 Heart of Darkness     7       57       36       -21
##  9 Heart of Darkness     8       48       38       -10
## 10 Heart of Darkness     9       31       34         3
## # ℹ 24 more rows
# Bar chart of net Bing sentiment per index block (legend hidden)
ggplot(data = heart3, mapping = aes(x = index, y = sentiment, fill = title)) +
  geom_col(show.legend = FALSE)

# Count how often each word occurs within each chapter, most frequent first.
# Counting on `chapter` and `word` directly (rather than group_by() followed
# by count()) yields the same rows in the same order but returns an ungrouped
# tibble, so later verbs are not silently applied per chapter.
count_heart <- heart2 %>%
  count(chapter, word, sort = TRUE)
count_heart
## # A tibble: 8,379 × 3
##    chapter word      n
##      <int> <chr> <int>
##  1       1 the     868
##  2       2 the     778
##  3       3 the     646
##  4       1 a       532
##  5       1 of      520
##  6       3 i       456
##  7       3 of      432
##  8       2 of      421
##  9       1 and     392
## 10       1 i       352
## # ℹ 8,369 more rows
# Drop every token that appears in tidytext's `stop_words` table
remove_heart <- anti_join(heart2, stop_words)
## Joining with `by = join_by(word)`
remove_heart
## # A tibble: 12,943 × 5
##    gutenberg_id title             linenumber chapter word    
##           <int> <chr>                  <int>   <int> <chr>   
##  1          526 Heart of Darkness          1       0 note    
##  2          526 Heart of Darkness          1       0 etext   
##  3          526 Heart of Darkness          1       0 219     
##  4          526 Heart of Darkness          1       0 version 
##  5          526 Heart of Darkness          1       0 ebook   
##  6          526 Heart of Darkness          3       0 heart   
##  7          526 Heart of Darkness          3       0 darkness
##  8          526 Heart of Darkness          6       0 joseph  
##  9          526 Heart of Darkness          6       0 conrad  
## 10          526 Heart of Darkness         12       1 nellie  
## # ℹ 12,933 more rows
# Word frequencies after stop-word removal, most frequent first
count(remove_heart, word, sort = TRUE)
## # A tibble: 4,978 × 2
##    word        n
##    <chr>   <int>
##  1 kurtz     101
##  2 time       78
##  3 river      65
##  4 looked     56
##  5 eyes       49
##  6 station    48
##  7 head       47
##  8 heard      47
##  9 black      43
## 10 manager    42
## # ℹ 4,968 more rows
# Attach the Bing sentiment to the remaining (non-stop-word) tokens and count
# each word/sentiment pair, most frequent first
heart4 <- inner_join(remove_heart, get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
heart4
## # A tibble: 1,121 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 darkness negative     26
##  2 lost     negative     26
##  3 dark     negative     25
##  4 dead     negative     23
##  5 death    negative     19
##  6 slowly   negative     18
##  7 devil    negative     15
##  8 fool     negative     14
##  9 lying    negative     14
## 10 doubt    negative     13
## # ℹ 1,111 more rows
# For each sentiment keep the ten highest word counts, order the words by
# count, and draw one bar-chart panel per sentiment (legend hidden)
heart4 %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  xlab("Contribution to sentiment") +
  ylab(NULL)

# Word cloud of the positive words: at most 30 words, sized by their count
pos <- filter(heart4, sentiment == "positive")

wordcloud(words = pos$word, freq = pos$n, max.words = 30, colors = "blue")
## Warning in wordcloud(words = pos$word, freq = pos$n, max.words = 30, colors =
## "blue"): profound could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = pos$word, freq = pos$n, max.words = 30, colors =
## "blue"): remarkable could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = pos$word, freq = pos$n, max.words = 30, colors =
## "blue"): extraordinary could not be fit on page. It will not be plotted.

# Word cloud of the negative words: at most 30 words, sized by their count
neg <- filter(heart4, sentiment == "negative")

wordcloud(words = neg$word, freq = neg$n, max.words = 30, colors = "blue")

Loughran Sentiment

# Print the gutenbergr metadata table to browse the catalogue of works
# (this line only displays the table; it does not download anything)
gutenberg_metadata
## # A tibble: 72,569 × 8
##    gutenberg_id title    author gutenberg_author_id language gutenberg_bookshelf
##           <int> <chr>    <chr>                <int> <chr>    <chr>              
##  1            1 "The De… Jeffe…                1638 en       "Politics/American…
##  2            2 "The Un… Unite…                   1 en       "Politics/American…
##  3            3 "John F… Kenne…                1666 en       ""                 
##  4            4 "Lincol… Linco…                   3 en       "US Civil War"     
##  5            5 "The Un… Unite…                   1 en       "United States/Pol…
##  6            6 "Give M… Henry…                   4 en       "American Revoluti…
##  7            7 "The Ma… <NA>                    NA en       ""                 
##  8            8 "Abraha… Linco…                   3 en       "US Civil War"     
##  9            9 "Abraha… Linco…                   3 en       "US Civil War"     
## 10           10 "The Ki… <NA>                    NA en       "Banned Books List…
## # ℹ 72,559 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
# Look up "Heart of Darkness" in the Gutenberg catalogue and download its
# text, keeping the title as an extra column
lou <- gutenberg_download(
  gutenberg_works(title == "Heart of Darkness"),
  meta_fields = "title"
)


# Number every line and add a running chapter counter: the counter goes up on
# each line made up only of digits or the letters i, v, x, l, c (any case),
# i.e. the "I", "II", "III" headings. Then split the text into one word per row.
lou2 <- lou %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^([\\divxlc])+$", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

# Tag each word with its Loughran categories, count the words per category in
# blocks of 100 lines, spread the counts into one column per category, and
# compute net sentiment as positive minus negative.
#
# A word can carry more than one Loughran category and also occurs many times
# in the text, so this join is many-to-many by design. Stating the join key
# and the relationship explicitly documents that intent and avoids dplyr's
# "unexpected many-to-many relationship" warning.
lou3 <- lou2 %>%
  inner_join(
    get_sentiments("loughran"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(title, index = linenumber %/% 100, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
lou3
## # A tibble: 34 × 8
##    title    index constraining litigious negative positive uncertainty sentiment
##    <chr>    <dbl>        <int>     <int>    <int>    <int>       <int>     <int>
##  1 Heart o…     0            3         3        7       12           7         5
##  2 Heart o…     1            0         0       17        7          13       -10
##  3 Heart o…     2            0         1       15        7           9        -8
##  4 Heart o…     3            1         5       12        9           7        -3
##  5 Heart o…     4            0         1        7       16          13         9
##  6 Heart o…     5            0         2       11        4          13        -7
##  7 Heart o…     6            0         6       21        5          12       -16
##  8 Heart o…     7            1         1       22        8          10       -14
##  9 Heart o…     8            0         2       17        9          18        -8
## 10 Heart o…     9            1         0       12       10          15        -2
## # ℹ 24 more rows
# Keep the Loughran lexicon in a variable so custom entries can be added to it
loughran_lexicon <- get_sentiments(lexicon = "loughran")


# Bar chart of net Loughran sentiment per index block (legend hidden)
ggplot(data = lou3, mapping = aes(x = index, y = sentiment, fill = title)) +
  geom_col(show.legend = FALSE)

# Extend the Loughran lexicon with two hand-picked entries:
# "black" as negative and "eyes" as positive
custom_lexicon <- bind_rows(
  loughran_lexicon,
  tibble(
    word = c("black", "eyes"),
    sentiment = c("negative", "positive")
  )
)
# Count each word/sentiment pair using the custom lexicon, most frequent first.
#
# A word can carry more than one category in the lexicon and also occurs many
# times in the text, so this join is many-to-many by design. Stating the join
# key and the relationship explicitly documents that intent and avoids dplyr's
# "unexpected many-to-many relationship" warning.
custom_words <- lou2 %>%
  inner_join(custom_lexicon, by = "word", relationship = "many-to-many") %>%
  count(word, sentiment, sort = TRUE)
custom_words
## # A tibble: 421 × 3
##    word      sentiment       n
##    <chr>     <chr>       <int>
##  1 could     uncertainty   111
##  2 eyes      positive       49
##  3 great     positive       46
##  4 black     negative       43
##  5 good      positive       42
##  6 suddenly  uncertainty    35
##  7 perhaps   uncertainty    29
##  8 lost      negative       26
##  9 sometimes uncertainty    24
## 10 believe   uncertainty    22
## # ℹ 411 more rows