test

Author

lena

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(wordcloud2)
library(quanteda)
Package version: 3.3.1
Unicode version: 14.0
ICU version: 71.1
Parallel computing: 8 of 8 threads used.
See https://quanteda.io for tutorials and examples.
library(tidytext)
hunger_games<-read_csv('hunger_games.txt', col_names=FALSE)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 528 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): X1

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
catching_fire<-read_csv('catching_fire.txt', col_names=FALSE)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 9771 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): X1

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mockingjay<-read_csv('mockingjay.txt', col_names=FALSE)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 4918 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): X1

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
divergent<-read_csv('divergent.txt', col_names=FALSE)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 9932 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): X1

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
allegiant<-read_csv('allegiant.txt', col_names=FALSE)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 13121 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): X1

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
insurgent<-read_csv('insurgent.txt', col_names=FALSE)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
Rows: 5244 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): X1

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hunger_games |> 
  unnest_tokens(bigram, X1, token="ngrams", n=2) -> hg_bigram

hg_bigram |> 
  separate(bigram, c('word1', 'word2'),sep=" ") |> 
  filter(!word1%in% stop_words$word) |>
  filter(!word2%in% stop_words$word) |>
  count(word1, word2, sort=TRUE) ->hg_bigrams2


catching_fire |>
  unnest_tokens(bigram, X1, token="ngrams", n=2) -> cf_bigram

cf_bigram |> 
  separate(bigram, c('word1', 'word2'),sep=" ") |> 
  filter(!word1%in% stop_words$word) |>
  filter(!word2%in% stop_words$word) |>
  count(word1, word2, sort=TRUE) ->cf_bigrams2


mockingjay |>
  unnest_tokens(bigram, X1, token="ngrams", n=2) -> mj_bigram

mj_bigram |>
  separate(bigram, c('word1', 'word2'),sep=" ") |> 
  filter(!word1%in% stop_words$word) |>
  filter(!word2%in% stop_words$word) |>
  count(word1, word2, sort=TRUE) ->mj_bigrams2


hg_bigrams2 |> 
  full_join(cf_bigrams2) |> 
  full_join(mj_bigrams2) -> hgseries_bigrams
Joining with `by = join_by(word1, word2, n)`
Joining with `by = join_by(word1, word2, n)`
hgseries_bigrams |> 
  filter(word2 %in% 'peeta')
# A tibble: 333 × 3
   word1    word2     n
   <chr>    <chr> <int>
 1 hear     peeta     3
 2 katniss  peeta     3
 3 cheese   peeta     2
 4 choice   peeta     2
 5 haymitch peeta     2
 6 home     peeta     2
 7 kill     peeta     2
 8 leave    peeta     2
 9 losing   peeta     2
10 matter   peeta     2
# ℹ 323 more rows
hgseries_bigrams |> 
  filter(word1 %in% 'peeta')
# A tibble: 269 × 3
   word1 word2       n
   <chr> <chr>   <int>
 1 peeta mellark    17
 2 peeta hes         5
 3 peeta pulls       4
 4 peeta shakes      4
 5 peeta begins      3
 6 peeta holds       3
 7 peeta raises      3
 8 peeta takes       3
 9 peeta cry         2
10 peeta doesnt      2
# ℹ 259 more rows
hgseries_bigrams |> 
  filter(word2 %in% 'katniss')
# A tibble: 93 × 3
   word1           word2       n
   <chr>           <chr>   <int>
 1 dont            katniss     2
 2 katniss         katniss     2
 3 luck            katniss     2
 4 arena           katniss     1
 5 audience        katniss     1
 6 belt            katniss     1
 7 calling         katniss     1
 8 cannon          katniss     1
 9 congratulations katniss     1
10 dreamily        katniss     1
# ℹ 83 more rows
hgseries_bigrams |> 
  filter(word1 %in% 'katniss')
# A tibble: 32 × 3
   word1   word2         n
   <chr>   <chr>     <int>
 1 katniss everdeen      6
 2 katniss peeta         3
 3 katniss katniss       2
 4 katniss cinna         1
 5 katniss everdeens     1
 6 katniss im            1
 7 katniss remember      1
 8 katniss roots         1
 9 katniss rue           1
10 katniss rues          1
# ℹ 22 more rows
divergent |> 
  unnest_tokens(bigram, X1, token="ngrams", n=2) -> divergent_bigram
divergent_bigram |>
  separate(bigram, c('word1', 'word2'),sep=" ") |> 
  filter(!word1%in% stop_words$word) |>
  filter(!word2%in% stop_words$word) |>
  count(word1, word2, sort=TRUE) ->divergent_bigrams2





insurgent |> 
  unnest_tokens(bigram, X1, token="ngrams", n=2) -> insurgent_bigram
divergent_bigram |>
  separate(bigram, c('word1', 'word2'),sep=" ") |> 
  filter(!word1%in% stop_words$word) |>
  filter(!word2%in% stop_words$word) |>
  count(word1, word2, sort=TRUE) ->insurgent_bigrams2



allegiant |> 
  unnest_tokens(bigram, X1, token="ngrams", n=2) -> allegiant_bigram

allegiant_bigram |> 
  separate(bigram, c('word1', 'word2'),sep=" ") |> 
  filter(!word1%in% stop_words$word) |>
  filter(!word2%in% stop_words$word) |>
  count(word1, word2, sort=TRUE) -> allegiant_bigrams2




divergent_bigrams2 |> 
  full_join(insurgent_bigrams2) |> 
  full_join(allegiant_bigrams2) -> divergentseries_bigrams
Joining with `by = join_by(word1, word2, n)`
Joining with `by = join_by(word1, word2, n)`
divergentseries_bigrams |> 
  filter(word2 %in% 'tris')
# A tibble: 68 × 3
   word1           word2     n
   <chr>           <chr> <int>
 1 careful         tris      3
 2 1               tris      1
 3 6               tris      1
 4 ahead           tris      1
 5 boyfriend       tris      1
 6 brave           tris      1
 7 congratulations tris      1
 8 hallucination   tris      1
 9 hey             tris      1
10 human           tris      1
# ℹ 58 more rows
divergentseries_bigrams |> 
  filter(word1 %in% 'tris')
# A tibble: 43 × 3
   word1 word2         n
   <chr> <chr>     <int>
 1 tris  al            1
 2 tris  climbed       1
 3 tris  lauren        1
 4 tris  tobias        1
 5 tris  uriah         1
 6 tris  christina     3
 7 tris  nita          3
 8 tris  tobias        3
 9 tris  sits          2
10 tris  stands        2
# ℹ 33 more rows
divergentseries_bigrams |> 
  filter(word1 %in% 'tobias')
# A tibble: 69 × 3
   word1  word2         n
   <chr>  <chr>     <int>
 1 tobias walks         3
 2 tobias holding       2
 3 tobias beatrice      1
 4 tobias brushing      1
 5 tobias caleb         1
 6 tobias candor        1
 7 tobias clenching     1
 8 tobias closes        1
 9 tobias crouches      1
10 tobias cry           1
# ℹ 59 more rows
divergentseries_bigrams |> 
  filter(word1 %in% 'tobias')
# A tibble: 69 × 3
   word1  word2         n
   <chr>  <chr>     <int>
 1 tobias walks         3
 2 tobias holding       2
 3 tobias beatrice      1
 4 tobias brushing      1
 5 tobias caleb         1
 6 tobias candor        1
 7 tobias clenching     1
 8 tobias closes        1
 9 tobias crouches      1
10 tobias cry           1
# ℹ 59 more rows
hunger_games |> 
  unnest_tokens (word, X1)|> 
  anti_join(stop_words) |> 
  count(word, sort= TRUE) |> 
  mutate(Book = "The Hunger Games") -> hunger_games_clean
Joining with `by = join_by(word)`
catching_fire |> 
  unnest_tokens (word, X1) |> 
  anti_join(stop_words) |> 
  count(word, sort =TRUE) |> 
  mutate(Book="Catching Fire") -> catchingfire_clean
Joining with `by = join_by(word)`
mockingjay |> 
  unnest_tokens(word, X1) |> 
  anti_join(stop_words) |> 
  count(word, sort=TRUE) |> 
  mutate(Book="Mockingjay") -> mockingjay_clean
Joining with `by = join_by(word)`
hunger_games_clean |> 
  full_join(catchingfire_clean) |> 
  full_join(mockingjay_clean)  |> 
  mutate(Series = "The Hunger Games") -> hunger_games_books
Joining with `by = join_by(word, n, Book)`
Joining with `by = join_by(word, n, Book)`
divergent |> 
  unnest_tokens(word,X1) |> 
  anti_join(stop_words) |> 
  count(word, sort=TRUE) |> 
  mutate(Book="divergent") -> divergent_clean
Joining with `by = join_by(word)`
insurgent |> 
  unnest_tokens(word, X1) |> 
  anti_join(stop_words) |> 
  count(word, sort=TRUE) |> 
  mutate(Book="insurgent") -> insurgent_clean
Joining with `by = join_by(word)`
allegiant |> 
  unnest_tokens(word, X1) |> 
  anti_join(stop_words) |> 
  count(word, sort =TRUE ) |> 
  mutate(Book = 'allegiant') -> allegiant_clean
Joining with `by = join_by(word)`
divergent_clean |> 
  full_join(insurgent_clean) |> 
  full_join(allegiant_clean) |> 
  mutate(Series = "Divergent") -> divergent_books
Joining with `by = join_by(word, n, Book)`
Joining with `by = join_by(word, n, Book)`
hunger_games_books |> 
  full_join(divergent_books) -> merged
Joining with `by = join_by(word, n, Book, Series)`
merged |>
  mutate(word = gsub(pattern = '[[:punct:]]', replacement = '', word) ) -> merged
merged |>
  inner_join(get_sentiments('afinn')) |>
  group_by(Book) |>
  summarize(average = mean(value)) |>
  ggplot(aes(Book, average, fill = average)) + geom_col() + coord_flip() + theme_classic() +
  labs(x='Book', y='Average', title = "Sentiment Analysis of The Hunger Games and Divergent Series" )
Joining with `by = join_by(word)`

merged |> 
  arrange(desc(n)) |> 
  filter(!word %in% c("tris","tobias","peeta","im", 
                "haymitch","gale","finnick","dont","christina", "its", "im", "dont", "it's", "i'm", "hand", 
                "nita", "doesnt", "caleb", "eric", "cant", "plutarch", 
                "marcus", "jeanine", "ive", "hes", "al", "cara", "peter", "cara", 
                "snow", "katniss", "rue", "mathew", "ing", "didnt", "boggs", "matthew", "evelyn", "david")) |> 
  head(65) |>
  ggplot(aes(word, n, fill=word)) + geom_col() + coord_flip() + theme_classic() +
  labs( x = "Word", y = "Frequency of Word ", title = "Most Common Overlapping Words in The Hunger Games and Divergent Series")

hunger_games_books |>
  arrange(desc(n)) |> 
  filter(!word %in% c("peeta","im", 
                      "haymitch","gale","finnick","dont","prim", "its", "im", "dont", "it's", "i'm", "hand", "katniss", 
                      "plutarch", "ill", "hes", "cato", "ive", "rue", "snow", "cinna", "shes", "beetee", "johanna", "boggs" , "peeta's")) |> 
  head(40) |>
  ggplot(aes(word, n, fill=word)) + geom_col() + coord_flip() + theme_classic()+
  labs( x = "Word", y="Number of Times the Word Was in the Series", 
        title = "Top Words in The Hunger Games Series")

divergent_books |>
  mutate(word = gsub(pattern = '[[:punct:]]', replacement = '', word) ) -> divergent_books

divergent_books |> 
  arrange(desc(n)) |> 
  filter(! word %in% c("tris", "tobias", "nita", "marcus", "jeanine", 
                       "it's", "i'm", "don't", "can't", "christina", "im", "its", "dont", "cant", 
                       "al", "caleb", "cara", "eric", "doesnt", "matthew", "peter", "ing", "didnt", "hands")) |> 
  head(40) |>
  ggplot(aes(word, n, fill=word)) + geom_col() + coord_flip() + theme_classic() + 
  labs(y= "Number of Times the Word was in the Series ", x="Word", title = "Top Words in The Divergent Series")