library(tidytext)

get_sentiments("afinn")

A tibble: 2,477 × 2

word value
1 abandon -2
2 abandoned -2
3 abandons -2
4 abducted -2
5 abduction -2
6 abductions -2
7 abhor -3
8 abhorred -3
9 abhorrent -3
10 abhors -3
# ℹ 2,467 more rows

library(tidytext)

get_sentiments("bing")

A tibble: 6,786 × 2

word sentiment
1 2-faces negative
2 abnormal negative
3 abolish negative
4 abominable negative
5 abominably negative
6 abominate negative
7 abomination negative
8 abort negative
9 aborted negative
10 aborts negative
# ℹ 6,776 more rows

library(tidytext)

get_sentiments("nrc")

A tibble: 13,872 × 2

word sentiment
1 abacus trust
2 abandon fear
3 abandon negative
4 abandon sadness
5 abandoned anger
6 abandoned fear
7 abandoned negative
8 abandoned sadness
9 abandonment anger
10 abandonment fear
# ℹ 13,862 more rows

library(janeaustenr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

# Tokenise Jane Austen's novels into one word per row. Before tokenising, each
# line of text is tagged with its line number within the book and a running
# chapter counter.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    # The counter goes up by one at every line that looks like a chapter
    # heading ("chapter" followed by a digit or a roman-numeral letter).
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)


tidy_books

A tibble: 725,064 × 4

book linenumber chapter word

1 Sense & Sensibility 1 0 sense
2 Sense & Sensibility 1 0 and
3 Sense & Sensibility 1 0 sensibility
4 Sense & Sensibility 3 0 by
5 Sense & Sensibility 3 0 jane
6 Sense & Sensibility 3 0 austen
7 Sense & Sensibility 5 0 1811
8 Sense & Sensibility 10 1 chapter
9 Sense & Sensibility 10 1 1
10 Sense & Sensibility 13 1 the
# ℹ 725,054 more rows

#austen_books()

# Keep only the NRC lexicon entries tagged with the "joy" sentiment.
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Count how often each joy word occurs in "Emma", most frequent first.
# The join key is spelled out so the join does not depend on whichever
# column names the two tables happen to share.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)

## Joining with `by = join_by(word)`

A tibble: 301 × 2

word n
1 good 359
2 friend 166
3 hope 143
4 happy 125
5 love 117
6 deal 92
7 found 92
8 present 89
9 kind 82
10 happiness 76
# ℹ 291 more rows

library(tidyr)

a <- get_sentiments("bing")
a

A tibble: 6,786 × 2

# Net sentiment (positive minus negative word count) for each block of 80
# lines in each book, using the Bing lexicon.
jane_austen_sentiment <- tidy_books %>%
  # A word in the text can match more than one lexicon row, which makes dplyr
  # warn about an unexpected many-to-many join. Declaring the relationship
  # keeps the result unchanged and silences that warning; `by` makes the join
  # key explicit.
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435443 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

Getting the corpus and preparing it for sentiment analysis with the NRC lexicon

library(tidytext)
library(textdata)
library(gutenbergr)

#get_sentiments("nrc")

# Look up the catalogue entry for the Mark Twain work with Gutenberg ID 7100.
gutenberg_works(author == "Twain, Mark") %>%
  filter(gutenberg_id == "7100")

A tibble: 1 × 8

gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
1 7100 Adventur… Twain… 53 en “”
# ℹ 2 more variables: rights , has_text

# Download work 7100 and keep only the `text` element of the result.
book <- gutenberg_download(7100)[["text"]]

## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

# One row per line of text, numbered in order, then split into one row per
# word. The text column is named explicitly instead of relying on the name
# as.data.frame() derives from the variable, and the ungroup() call is dropped
# because the data was never grouped.
book <- data.frame(book = book, stringsAsFactors = FALSE) %>%
  mutate(rownumber = row_number()) %>%
  unnest_tokens(word, book)

Doing the sentiment analysis. The following are the words in the text that are associated with fear.

library(tidytext)
library(textdata)
library(gutenbergr)

# NRC lexicon entries tagged with the "fear" sentiment. The table is named
# for the sentiment it actually holds.
nrc_fear <- get_sentiments("nrc") %>%
  filter(sentiment == "fear")

# Keep only the words of the book that appear in the fear lexicon, joining
# explicitly on the shared `word` column.
twain_tidy <- book %>%
  inner_join(nrc_fear, by = "word")

## Joining with `by = join_by(word)`

twain_tidy

rownumber        word sentiment

1 12 banished fear 2 13 shot fear 3 15 ordnance fear 4 58 awful fear 5 63 dismal fear 6 71 harm fear 7 72 sweat fear 8 73 sweat fear 9 83 sweat fear 10 101 deadly fear 11 105 bad fear 12 106 mad fear 13 106 harm fear 14 107 change fear 15 108 wicked fear 16 127 mournful fear 17 129 die fear 18 132 ghost fear 19 134 grave fear 20 136 spider fear 21 138 awful fear 22 138 bad fear 23 139 bad fear 24 142 confidence fear 25 144 bad fear 26 145 spider fear 27 147 shaking fear 28 148 death fear 29 179 die fear 30 204 disturbance fear 31 209 sweat fear 32 223 death fear 33 229 fire fear 34 233 devil fear 35 238 devil fear 36 239 ruined fear 37 240 devil fear 38 246 awful fear 39 249 scar fear 40 260 gang fear 41 267 kill fear 42 269 cross fear 43 273 carcass fear 44 275 gang fear 45 275 curse fear 46 280 robber fear 47 280 gang fear 48 282 kill fear 49 295 rule fear 50 296 kill fear 51 300 kill fear 52 307 gang fear 53 309 robbery fear 54 309 murder fear 55 311 rob fear 56 313 stealing fear 57 313 robbery fear 58 316 kill fear 59 318 kill fear 60 321 kill fear 61 343 death fear 62 347 guard fear 63 348 shoot fear 64 350 guard fear 65 351 watch fear 66 352 ransom fear 67 359 ransom fear 68 362 kill fear 69 364 kill fear 70 371 mighty fear 71 377 robber fear 72 380 mad fear 73 382 rob fear 74 382 kill fear 75 385 wicked fear 76 388 gang fear 77 400 scold fear 78 403 pray fear 79 412 pray fear 80 423 worry fear 81 443 buried fear 82 445 mighty fear 83 451 robber fear 84 460 gang fear 85 464 guard fear 86 466 kill fear 87 477 busted fear 88 479 tract fear 89 488 infant fear 90 504 lightning fear 91 506 shot fear 92 515 marry fear 93 528 highest fear 94 536 sweat fear 95 556 hiding fear 96 568 bad fear 97 571 bad fear 98 572 feeling fear 99 572 shaky fear 100 574 bad fear 101 576 watch fear 102 585 cross fear 103 586 devil fear 104 648 bad fear 105 669 hurt fear 106 673 marry fear 107 687 mistaken fear 108 689 unexpected fear 109 699 busted fear 110 708 bug fear 111 733 
swelling fear 112 756 growling fear 113 785 shell fear 114 798 force fear 115 800 court fear 116 812 court fear 117 832 die fear 118 840 powerful fear 119 842 rod fear 120 844 broke fear 121 845 death fear

# Number of occurrences of each fear word, one row per distinct word.
count(twain_tidy, word)

      word  n

1 awful 3 2 bad 8 3 banished 1 4 broke 1 5 bug 1 6 buried 1 7 busted 2 8 carcass 1 9 change 1 10 confidence 1 11 court 2 12 cross 2 13 curse 1 14 deadly 1 15 death 4 16 devil 4 17 die 3 18 dismal 1 19 disturbance 1 20 feeling 1 21 fire 1 22 force 1 23 gang 6 24 ghost 1 25 grave 1 26 growling 1 27 guard 3 28 harm 2 29 hiding 1 30 highest 1 31 hurt 1 32 infant 1 33 kill 11 34 lightning 1 35 mad 2 36 marry 2 37 mighty 2 38 mistaken 1 39 mournful 1 40 murder 1 41 ordnance 1 42 powerful 1 43 pray 2 44 ransom 2 45 rob 2 46 robber 3 47 robbery 2 48 rod 1 49 ruined 1 50 rule 1 51 scar 1 52 scold 1 53 shaking 1 54 shaky 1 55 shell 1 56 shoot 1 57 shot 2 58 spider 2 59 stealing 1 60 sweat 5 61 swelling 1 62 tract 1 63 unexpected 1 64 watch 2 65 wicked 2 66 worry 1

library(ggplot2)

# Frequency of each fear word.
a <- count(twain_tidy, word)

# Row(s) whose count equals the largest count.
filter(a, n == max(n))

word n
1 kill 11

I had trouble producing the plot, so for the analysis I instead looked at the most frequently used fear word in the story, which could be taken as the author’s favorite such word while writing the story.

Text Mining

Atta Boateng

03-31-2024

A tibble: 2,477 × 2

A tibble: 6,786 × 2

A tibble: 13,872 × 2

A tibble: 725,064 × 4

A tibble: 301 × 2

A tibble: 6,786 × 2

A tibble: 1 × 8