UP_Week_10_Assignment

# Install tidytext only when it is not already available, so that re-running
# the script does not download the package again. The mirror is reached over
# https rather than plain http.
if (!requireNamespace("tidytext", quietly = TRUE)) {
  install.packages("tidytext", repos = "https://cran.us.r-project.org")
}

## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprBSJLw/downloaded_packages

library("tidytext")

# Load the Bing sentiment lexicon provided through tidytext.
bing <- get_sentiments("bing")

library(janeaustenr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)


# Tokenise Jane Austen's books into one word per row. Every line of text first
# receives a running line number within its own book, so that words can later
# be bucketed into fixed-size sections of each book.
ttidy_books_austen <- austen_books() |>
  mutate(linenumber = row_number(), .by = book) |>
  tidytext::unnest_tokens(output = word, input = text)

# Select the positive words from the Bing lexicon. Bing labels words only as
# "positive" or "negative", so "positive" is the closest match to "joy"; the
# earlier filter on "grief" could never match a row.
bing_joy <- get_sentiments("bing") |>
  filter(sentiment == "positive")

# Count the most frequent positive words in "Emma". The earlier book filter
# used "Sandition", which is not one of the titles in austen_books(), so the
# result was always an empty tibble.
ttidy_books_austen |>
  filter(book == "Emma") |>
  inner_join(bing_joy, by = "word") |>
  count(word, sort = TRUE)

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.3.2

# Net sentiment per 80-line section of each book: tag every word with its Bing
# sentiment, count positive and negative words per section, then take the
# difference. The join reports a many-to-many match because at least one word
# occurs more than once in the lexicon.
jane_austen_sentiment <- ttidy_books_austen |>
  inner_join(get_sentiments("bing")) |>
  mutate(index = linenumber %/% 80) |>
  count(book, index, sentiment) |>
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) |>
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(ttidy_books_austen, get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

library(ggplot2)

# One panel per book, showing net sentiment for each section in order.
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")

# Keep only the tokens that belong to "Emma", then print them.
emma <- filter(ttidy_books_austen, book == "Emma")

emma

## # A tibble: 160,996 × 3
##    book  linenumber word     
##    <fct>      <int> <chr>    
##  1 Emma           1 emma     
##  2 Emma           3 by       
##  3 Emma           3 jane     
##  4 Emma           3 austen   
##  5 Emma           8 volume   
##  6 Emma           8 i        
##  7 Emma          12 chapter  
##  8 Emma          12 i        
##  9 Emma          15 emma     
## 10 Emma          15 woodhouse
## # ℹ 160,986 more rows

# Tag each word of "Emma" with its Bing sentiment and record which lexicon
# produced the label.
emmabing <- inner_join(emma, get_sentiments("bing")) |>
  mutate(method = "Bing")

## Joining with `by = join_by(word)`

## Warning in inner_join(emma, get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 32813 of `x` matches multiple rows in `y`.
## ℹ Row 4099 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Net Bing sentiment for "Emma" per 80-line section: count positive and
# negative words in each section and subtract.
combined_sentiments <- emmabing |>
  mutate(index = linenumber %/% 80) |>
  count(method, index, sentiment) |>
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) |>
  mutate(sentiment = positive - negative)

# Net Bing sentiment for "Emma" per 80-line section. Despite the variable
# name, only the Bing lexicon is used here, so there is a single data frame
# and no row-binding is needed.
bing_and_nrc <- emma |>
  inner_join(get_sentiments("bing")) |>
  mutate(method = "Bing") |>
  count(method, index = linenumber %/% 80, sentiment) |>
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(emma, get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 32813 of `x` matches multiple rows in `y`.
## ℹ Row 4099 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Plot net sentiment per section, one panel per lexicon method. The data is
# already a single data frame, so it is piped straight into ggplot().
bing_and_nrc |>
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

# How often each sentiment-bearing word occurs across all books, most
# frequent first.
bing_word_counts <- ttidy_books_austen |>
  inner_join(get_sentiments("bing")) |>
  count(word, sentiment) |>
  arrange(desc(n)) |>
  ungroup()

## Joining with `by = join_by(word)`

## Warning in inner_join(ttidy_books_austen, get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Custom entries for the stop-word list, tagged with their own lexicon label.
custom_start_words <- tibble(
  # Placeholder values: replace them with your own custom words.
  word = c("start_word1", "start_word2"),
  lexicon = "custom"
)

# Place the custom entries ahead of the built-in stop words.
custom_stop_words <- custom_start_words |>
  bind_rows(stop_words)

custom_stop_words

## # A tibble: 1,151 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 start_word1 custom 
##  2 start_word2 custom 
##  3 a           SMART  
##  4 a's         SMART  
##  5 able        SMART  
##  6 about       SMART  
##  7 above       SMART  
##  8 according   SMART  
##  9 accordingly SMART  
## 10 across      SMART  
## # ℹ 1,141 more rows

# Install wordcloud only when it is not already available, so that re-running
# the script does not download the package again. The mirror is reached over
# https rather than plain http.
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud", repos = "https://cran.us.r-project.org")
}

## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprBSJLw/downloaded_packages

library("wordcloud")

## Loading required package: RColorBrewer

# Word cloud of up to 50 words, sized by frequency, after removing the
# standard stop words.
ttidy_books_austen |>
  anti_join(stop_words) |>
  count(word) |>
  with(wordcloud(words = word, freq = n, max.words = 50))

## Joining with `by = join_by(word)`

# Install tm only when it is not already available, so that re-running the
# script does not download the package again. The mirror is reached over
# https rather than plain http.
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm", repos = "https://cran.us.r-project.org")
}

## 
## The downloaded binary packages are in
##  /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprBSJLw/downloaded_packages

library("tm")

## Warning: package 'tm' was built under R version 4.3.2

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(wordcloud)


# Tag every word with its Bing sentiment and tally each word/sentiment pair,
# most frequent first.
word_counts <- ttidy_books_austen |>
  inner_join(get_sentiments("bing")) |>
  count(word, sentiment) |>
  arrange(desc(n))

## Joining with `by = join_by(word)`

## Warning in inner_join(ttidy_books_austen, get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Cross-tabulate the counts into a table with one row per word and one column
# per sentiment.
word_matrix <- xtabs(formula = n ~ word + sentiment, data = word_counts)

# Draw a comparison word cloud, using one colour per sentiment column.
comparison.cloud(
  term.matrix = word_matrix,
  colors = c("blue", "green"),
  max.words = 100
)

# Keep only the words that the Bing lexicon labels as negative.
bing_negative <- filter(get_sentiments("bing"), sentiment == "negative")

# Total number of words in each book (the data has no chapter column, so the
# tally is per book only). This replaces the earlier `word_counts` value.
word_counts <- ttidy_books_austen |>
  count(book, name = "words")

# Number of negative words in each book: keep only the tokens that appear in
# the negative Bing word list, then tally them per book.
negative_word_counts <- ttidy_books_austen |>
  semi_join(bing_negative) |>
  count(book, name = "negativewords")

## Joining with `by = join_by(word)`

UP_Week_10_Assignment

Ursula Podosenin

2024-03-29