install.packages("tidytext", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprBSJLw/downloaded_packages
library("tidytext")
# Load the bing sentiment lexicon once and keep it under a short name.
bing <- get_sentiments(lexicon = "bing")
library(janeaustenr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
# Build a one-word-per-row table of Jane Austen's novels: number the lines
# within each book, then split each line of text into individual word tokens.
ttidy_books_austen <- austen_books() |>
  # Per-operation grouping: numbering restarts for every book and the result
  # comes back ungrouped.
  mutate(linenumber = row_number(), .by = book) |>
  tidytext::unnest_tokens(output = word, input = text)
# The bing lexicon labels words only as "positive" or "negative"; it has no
# "grief" or "joy" category, so filtering on one of those returns no rows.
# The positive words are used here as the closest available stand-in for joy.
bing_joy <- get_sentiments("bing") |>
  filter(sentiment == "positive")

# Count how often each of those words occurs in a single novel. The title must
# match a value of `book` exactly ("Sandition" matched none, which left the
# result empty); "Emma" is one of the titles present in the data.
ttidy_books_austen |>
  filter(book == "Emma") |>
  # Name the join key explicitly instead of relying on the common-column guess.
  inner_join(bing_joy, by = "word") |>
  count(word, sort = TRUE)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.2
# Net sentiment (positive minus negative word count) for every 80-line section
# of each novel.
jane_austen_sentiment <- ttidy_books_austen |>
  # Some words are listed in the bing lexicon under more than one sentiment,
  # so a single token can match several lexicon rows. Declaring the join key
  # and the many-to-many relationship makes that explicit and avoids the
  # warning dplyr raises when it has to guess.
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) |>
  # Integer division maps lines 1-79 to index 0, 80-159 to index 1, and so on.
  count(book, index = linenumber %/% 80, sentiment) |>
  # One column per sentiment; sections lacking a sentiment get a count of 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
  mutate(sentiment = positive - negative)
library(ggplot2)
# One panel per novel showing net sentiment by section index. The legend is
# switched off; each panel has its own x-axis range.
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x")

# Keep only the word tokens that belong to "Emma", then display them.
emma <- filter(ttidy_books_austen, book == "Emma")
emma
## # A tibble: 160,996 × 3
## book linenumber word
## <fct> <int> <chr>
## 1 Emma 1 emma
## 2 Emma 3 by
## 3 Emma 3 jane
## 4 Emma 3 austen
## 5 Emma 8 volume
## 6 Emma 8 i
## 7 Emma 12 chapter
## 8 Emma 12 i
## 9 Emma 15 emma
## 10 Emma 15 woodhouse
## # ℹ 160,986 more rows
# Tag every word of "Emma" that appears in the bing lexicon with its sentiment
# and record which lexicon produced the label.
emmabing <- emma |>
  # Some words are listed in the bing lexicon under more than one sentiment,
  # so the join is declared many-to-many rather than left for dplyr to warn
  # about.
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) |>
  mutate(method = "Bing")

# Net sentiment (positive minus negative word count) per 80-line section.
combined_sentiments <- emmabing |>
  count(method, index = linenumber %/% 80, sentiment) |>
  # One column per sentiment; sections lacking a sentiment get a count of 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
  mutate(sentiment = positive - negative)
# Net sentiment of "Emma" per 80-line section using the bing lexicon.
# Only one lexicon is involved, so the single-argument bind_rows() calls that
# used to wrap this pipeline (they returned their input unchanged) are gone.
# The variable name is kept for any later code that refers to it, although no
# NRC data is included.
bing_and_nrc <- emma |>
  # Some words are listed in the bing lexicon under more than one sentiment,
  # so the join is declared many-to-many rather than left for dplyr to warn
  # about.
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) |>
  mutate(method = "Bing") |>
  count(method, index = linenumber %/% 80, sentiment) |>
  # One column per sentiment; sections lacking a sentiment get a count of 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
  mutate(sentiment = positive - negative)

# Bar chart of net sentiment by section, one panel per lexicon method.
bing_and_nrc |>
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

# How often each lexicon word occurs across all the novels, split by
# sentiment and sorted from most to least frequent.
bing_word_counts <- ttidy_books_austen |>
  # Some words are listed in the bing lexicon under more than one sentiment,
  # so the join is declared many-to-many rather than left for dplyr to warn
  # about.
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) |>
  # The input is ungrouped, so count() already returns an ungrouped tibble and
  # no trailing ungroup() is needed.
  count(word, sentiment, sort = TRUE)
# Placeholder custom entries, labelled with their own lexicon name so they can
# be told apart from the built-in list. Replace them with real words.
custom_start_words <- tibble(word = c("start_word1", "start_word2")) |>
  mutate(lexicon = "custom")

# Place the custom entries ahead of the existing stop words and show the
# combined table.
custom_stop_words <- custom_start_words |>
  bind_rows(stop_words)
custom_stop_words
## # A tibble: 1,151 × 2
## word lexicon
## <chr> <chr>
## 1 start_word1 custom
## 2 start_word2 custom
## 3 a SMART
## 4 a's SMART
## 5 able SMART
## 6 about SMART
## 7 above SMART
## 8 according SMART
## 9 accordingly SMART
## 10 across SMART
## # ℹ 1,141 more rows
install.packages("wordcloud", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprBSJLw/downloaded_packages
library("wordcloud")
## Loading required package: RColorBrewer
# Word cloud of the 50 most frequent words once stop words are removed.
# local() keeps the intermediate frequency table out of the global workspace.
local({
  word_freq <- count(anti_join(ttidy_books_austen, stop_words), word)
  wordcloud(word_freq$word, word_freq$n, max.words = 50)
})
## Joining with `by = join_by(word)`

install.packages("tm", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/5m/4f5rvwrn5rngf6j4gpl2mc9w0000gn/T//RtmprBSJLw/downloaded_packages
library("tm")
## Warning: package 'tm' was built under R version 4.3.2
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
# Count how often each bing-lexicon word occurs across the novels, split by
# sentiment and sorted from most to least frequent.
word_counts <- ttidy_books_austen |>
  # Some words are listed in the bing lexicon under more than one sentiment,
  # so the join is declared many-to-many rather than left for dplyr to warn
  # about.
  inner_join(
    get_sentiments("bing"),
    by = "word",
    relationship = "many-to-many"
  ) |>
  count(word, sentiment, sort = TRUE)

# Cross-tabulate the counts into a table with one row per word and one column
# per sentiment.
word_matrix <- xtabs(n ~ word + sentiment, data = word_counts)

# Comparison word cloud of up to 100 words drawn from that table.
comparison.cloud(word_matrix, colors = c("blue", "green"), max.words = 100)

# Negative entries of the bing lexicon.
bing_negative <- filter(get_sentiments("bing"), sentiment == "negative")

# Total number of word tokens in each book.
word_counts <- count(ttidy_books_austen, book, name = "words")

# Number of tokens in each book that appear in the negative word list.
# semi_join() only filters rows, so a token is never counted twice.
negative_word_counts <- ttidy_books_austen |>
  semi_join(bing_negative) |>
  count(book, name = "negativewords")
## Joining with `by = join_by(word)`