Assignment 10

In this assignment, you should start by getting the primary example code from chapter 2 working in an R Markdown document. You should provide a citation to this base code. You’re then asked to extend the code in two ways:

Setting up the example code from ‘Text Mining with R’ Chapter 2.

“2 Sentiment Analysis with Tidy Data.” Text Mining with R: a Tidy Approach, by Julia Silge and David Robinson, O’Reilly Media, 2017. https://www.tidytextmining.com/sentiment.html

library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts --------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.3
library(dplyr)
# AFINN lexicon: one row per word with a numeric `value` score.
get_sentiments(lexicon = "afinn")
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,467 more rows
# Bing lexicon: one row per word with a categorical `sentiment` label.
get_sentiments(lexicon = "bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows
# NRC lexicon: a word may appear on several rows, one per `sentiment` label.
get_sentiments(lexicon = "nrc")
## # A tibble: 13,901 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ... with 13,891 more rows
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.0.3
library(dplyr)


library(stringr)

# Tokenise Jane Austen's novels into one word per row. Before tokenising,
# each line is tagged (per book) with its line number and a running chapter
# counter that increments on every line starting with "chapter <number>".
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number()) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

# Keep only the NRC entries tagged with the "joy" sentiment.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Most frequent "joy" words in Emma.
# The join key is given explicitly so the join does not depend on whichever
# columns the two tables happen to share (and emits no "Joining, by" message).
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)
## # A tibble: 303 x 2
##    word        n
##    <chr>   <int>
##  1 good      359
##  2 young     192
##  3 friend    166
##  4 hope      143
##  5 happy     125
##  6 love      117
##  7 deal       92
##  8 found      92
##  9 present    89
## 10 kind       82
## # ... with 293 more rows

Setting up a New Corpus - Crime and Punishment.

We will use the gutenberg_metadata dataset from the gutenbergr package to look up the ID of the book we are interested in analyzing.

library(gutenbergr)
## Warning: package 'gutenbergr' was built under R version 4.0.3
# Find the Project Gutenberg record whose title is exactly
# "Crime and Punishment".
filter(gutenberg_metadata, title == "Crime and Punishment")
## # A tibble: 1 x 8
##   gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
##          <int> <chr> <chr>             <int> <chr>    <chr>            <chr> 
## 1         2554 Crim~ Dosto~              314 en       Best Books Ever~ Publi~
## # ... with 1 more variable: has_text <lgl>

The Gutenberg ID of the English version of “Crime and Punishment” is 2554.

# Download the full text of Project Gutenberg book 2554 (one row per line of
# text), then print a compact overview of the resulting table.
crime_punishment <- gutenberg_download(gutenberg_id = 2554)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
glimpse(crime_punishment)
## Rows: 22,061
## Columns: 2
## $ gutenberg_id <int> 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, ...
## $ text         <chr> "CRIME AND PUNISHMENT", "", "By Fyodor Dostoevsky", ""...

Tidying the book

# Drop the first 102 lines (everything before the first "PART" heading), then
# annotate every remaining line with:
#   line_num - its position in the trimmed text
#   part     - running count of "PART <number>" headings seen so far
#   chapter  - running count of "CHAPTER <number>" headings, restarted
#              within each part
tydying_crime_punishment <- crime_punishment %>%
  slice(-(1:102)) %>%
  mutate(
    line_num = row_number(),
    part = cumsum(
      str_detect(text, regex("^PART [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  group_by(part) %>%
  mutate(
    chapter = cumsum(
      str_detect(text, regex("^CHAPTER [\\divxlc]", ignore_case = TRUE))
    )
  ) %>%
  ungroup()

# Quick structural check of the annotated text.
glimpse(tydying_crime_punishment)
## Rows: 21,959
## Columns: 5
## $ gutenberg_id <int> 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, ...
## $ text         <chr> "PART I", "", "", "", "CHAPTER I", "", "On an exceptio...
## $ line_num     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
## $ part         <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ chapter      <int> 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...

Creating a new column so that each row holds a single word

# One token per row.
# Project Gutenberg texts mark emphasis as _word_, and the tokenizer keeps the
# underscores attached to the token. Every underscore must be stripped:
# str_replace() would remove only the first one, leaving tokens such as
# "word_" that never match the stop-word list or a sentiment lexicon.
Book <- tydying_crime_punishment %>%
  unnest_tokens(word, text) %>%
  mutate(word = str_remove_all(word, "_")) %>%
  # Tokens made up solely of underscores are now empty; drop them.
  filter(word != "")


# Remove stop words: keep only the tokens that do not appear in `stop_words`.
Text_CP <- anti_join(Book, stop_words, by = "word")

Analyzing Word Frequency

# Bar chart of the ten most frequent non-stop-words in the book.
Text_CP %>%
  count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n(); like top_n(), it keeps ties.
  slice_max(order_by = n, n = 10) %>%
  # Order the bars by frequency rather than alphabetically.
  ggplot(aes(x = fct_reorder(word, n), y = n, fill = word)) +
  geom_col(show.legend = FALSE) +
  scale_fill_viridis_d(option = "viridis") +
  coord_flip() +
  labs(title = "Crime and Punishment- Word Frequency", x = NULL) +
  theme_minimal()

The most frequently used word is the name of the main character, Raskolnikov.

Sentiment Analysis using nrc

# Net sentiment (positive minus negative NRC word counts) for consecutive
# 150-line sections of the book.
Text_CP %>%
  # Explicit key: no reliance on implicitly shared column names.
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(index = line_num %/% 150, sentiment) %>%
  # pivot_wider() replaces the superseded spread(); sections with no words
  # for a given sentiment get a count of 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  ggplot(aes(x = index, y = sentiment)) +
  geom_col(fill = "red", show.legend = FALSE) +
  labs(title = "Sentiment Analysis ") +
  theme_minimal()

We can see that the book has more negative sentiment than positive sentiment.

# Share of each NRC sentiment category among all sentiment-tagged words.
Text_CP %>%
  # Explicit key: no reliance on implicitly shared column names.
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(sentiment) %>%
  mutate(prop = n / sum(n)) %>%
  # Order the bars by proportion rather than alphabetically.
  ggplot(aes(x = fct_reorder(sentiment, prop), y = prop, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  scale_fill_viridis_d(option = "viridis") +
  labs(title = "Sentiment Analysis ", x = NULL) +
  coord_flip() +
  theme_minimal()

In conclusion, the nrc lexicon covers a wider variety of categories: besides negative and positive, it also tags words with associated feelings such as joy, anger, and so on.