607_week10_assignment

Starting with the base code for sentiment analysis from Chapter 2 of the book “Text Mining with R” by Silge and Robinson (Silge & Robinson, n.d.):

Silge, J., & Robinson, D. (n.d.). Sentiment analysis. In Text Mining with R: A Tidy Approach. Retrieved March 31, 2024, from https://www.tidytextmining.com/sentiment.html

library(tidytext)

# The sentiments datasets
# Print the AFINN lexicon: one row per word with a numeric `value` score.
get_sentiments("afinn")

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

# Print the Bing lexicon: each row pairs a word with a `sentiment` label
# ("positive" or "negative").
get_sentiments("bing")

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

# Print the NRC lexicon: a word can appear on several rows, one per
# `sentiment` category it is associated with (e.g. "fear", "negative").
get_sentiments("nrc")

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

# Sentiment analysis with inner join
library(janeaustenr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
# Matches a chapter heading at the start of a line, in any letter case,
# followed by an arabic digit or a roman-numeral character.
chapter_heading_pattern <- regex("^chapter [\\divxlc]", ignore_case = TRUE)

# Number the lines of every novel, keep a running chapter counter, and then
# split the text into one word per row.
tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, chapter_heading_pattern))
  ) %>%
  ungroup() %>%
  unnest_tokens(output = word, input = text)

# NRC entries tagged with the "joy" category.
nrc_joy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Joy words in "Emma", most frequent first.
tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)

## Joining with `by = join_by(word)`

## # A tibble: 301 × 2
##    word          n
##    <chr>     <int>
##  1 good        359
##  2 friend      166
##  3 hope        143
##  4 happy       125
##  5 love        117
##  6 deal         92
##  7 found        92
##  8 present      89
##  9 kind         82
## 10 happiness    76
## # ℹ 291 more rows

library(tidyr)
# Net Bing sentiment (positive minus negative word count) for every
# 80-line section of each novel.
jane_austen_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = 0
  ) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

library(ggplot2)
# One panel per novel, two panels per row; each panel has its own x range
# because the novels differ in length.
ggplot(
  data = jane_austen_sentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  facet_wrap(vars(book), ncol = 2, scales = "free_x") +
  geom_col(show.legend = FALSE)

# Comparing the three sentiment dictionaries
# Restrict the tidy word table to "Pride & Prejudice" and print it.
pride_prejudice <- filter(tidy_books, book == "Pride & Prejudice")
pride_prejudice

## # A tibble: 122,204 × 4
##    book              linenumber chapter word     
##    <fct>                  <int>   <int> <chr>    
##  1 Pride & Prejudice          1       0 pride    
##  2 Pride & Prejudice          1       0 and      
##  3 Pride & Prejudice          1       0 prejudice
##  4 Pride & Prejudice          3       0 by       
##  5 Pride & Prejudice          3       0 jane     
##  6 Pride & Prejudice          3       0 austen   
##  7 Pride & Prejudice          7       1 chapter  
##  8 Pride & Prejudice          7       1 1        
##  9 Pride & Prejudice         10       1 it       
## 10 Pride & Prejudice         10       1 is       
## # ℹ 122,194 more rows

# AFINN: sum the numeric word scores within each 80-line section.
afinn <- pride_prejudice %>%
  inner_join(get_sentiments("afinn")) %>%
  mutate(index = linenumber %/% 80) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")

## Joining with `by = join_by(word)`

# Bing and NRC: label the matches of each lexicon, stack them, and compute
# positive minus negative word counts per 80-line section and method.
# Only the "positive"/"negative" NRC categories are used.
bing_and_nrc <- bind_rows(
  pride_prejudice %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."),
  pride_prejudice %>%
    inner_join(
      filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative"))
    ) %>%
    mutate(method = "NRC")
) %>%
  mutate(index = linenumber %/% 80) %>%
  count(method, index, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 215 of `x` matches multiple rows in `y`.
## ℹ Row 5178 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Stack the three methods and draw one panel per method, each with its own
# y scale.
ggplot(
  data = bind_rows(afinn, bing_and_nrc),
  mapping = aes(x = index, y = sentiment, fill = method)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(vars(method), ncol = 1, scales = "free_y")

# How many NRC words are labelled positive vs. negative?
count(
  filter(get_sentiments("nrc"), sentiment %in% c("positive", "negative")),
  sentiment
)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308

# How many Bing words are labelled positive vs. negative?
count(get_sentiments("bing"), sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

# Most common positive and negative words
# Count every (word, Bing label) pair across all novels, most frequent first.
bing_matches <- inner_join(tidy_books, get_sentiments("bing"))
bing_word_counts <- ungroup(
  count(bing_matches, word, sentiment, sort = TRUE)
)

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Print the (word, sentiment) frequency table built above.
bing_word_counts

## # A tibble: 2,585 × 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 miss     negative   1855
##  2 well     positive   1523
##  3 good     positive   1380
##  4 great    positive    981
##  5 like     positive    725
##  6 better   positive    639
##  7 enough   positive    613
##  8 happy    positive    534
##  9 love     positive    495
## 10 pleasure positive    462
## # ℹ 2,575 more rows

# Ten most frequent words per Bing label, one panel per label.
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = n, y = word, fill = sentiment)) +
  facet_wrap(vars(sentiment), scales = "free_y") +
  geom_col(show.legend = FALSE) +
  labs(
    y = NULL,
    x = "Contribution to sentiment"
  )

# Prepend "miss" (tagged with a "custom" lexicon) to the standard stop-word
# list, then print the result.
custom_stop_words <- bind_rows(
  tibble(word = "miss", lexicon = "custom"),
  stop_words
)
custom_stop_words

## # A tibble: 1,150 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 miss        custom 
##  2 a           SMART  
##  3 a's         SMART  
##  4 able        SMART  
##  5 about       SMART  
##  6 above       SMART  
##  7 according   SMART  
##  8 accordingly SMART  
##  9 across      SMART  
## 10 actually    SMART  
## # ℹ 1,140 more rows

# Wordclouds
library(wordcloud)

## Loading required package: RColorBrewer

# Word cloud of the words left after removing stop words, limited to
# 100 words.
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  {
    wordcloud(words = .$word, freq = .$n, max.words = 100)
  }

## Joining with `by = join_by(word)`

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

# Reshape the Bing word counts into a word-by-sentiment matrix (0 where a
# word has no count for a label) and draw a comparison cloud from it.
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    max.words = 100,
    colors = c("gray20", "gray80")
  )

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 435434 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Looking at units beyond just words
# Tokenise "Pride & Prejudice" into sentences and show the second one.
p_and_p_sentences <- unnest_tokens(
  tibble(text = prideprejudice),
  sentence,
  text,
  token = "sentences"
)
p_and_p_sentences[["sentence"]][2]

## [1] "by jane austen"

# Split each novel into chapter-sized chunks at its chapter headings.
# The alternation is wrapped in a group: without it the pattern reads as
# `Chapter` OR `CHAPTER <number>`, so a bare "Chapter" would split the text
# and the number would only be consumed for the upper-case spelling.
austen_chapters <- austen_books() %>%
  group_by(book) %>%
  unnest_tokens(chapter, text, token = "regex",
                pattern = "(?:Chapter|CHAPTER) [\\dIVXLC]") %>%
  ungroup()

# Number of chunks per novel.
austen_chapters %>%
  group_by(book) %>%
  summarise(chapters = n())

## # A tibble: 6 × 2
##   book                chapters
##   <fct>                  <int>
## 1 Sense & Sensibility       51
## 2 Pride & Prejudice         62
## 3 Mansfield Park            49
## 4 Emma                      56
## 5 Northanger Abbey          32
## 6 Persuasion                25

# Negative half of the Bing lexicon.
bingnegative <- filter(get_sentiments("bing"), sentiment == "negative")

# Total number of words in every chapter of every novel.
wordcounts <- tidy_books %>%
  group_by(book, chapter) %>%
  summarize(words = n())

## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.

# For each novel, find the chapter with the highest share of negative words.
# Chapter 0 (text before the first chapter heading) is excluded.
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  filter(chapter != 0) %>%
  mutate(ratio = negativewords / words) %>%
  slice_max(order_by = ratio, n = 1) %>%
  ungroup()

## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.

## # A tibble: 6 × 5
##   book                chapter negativewords words  ratio
##   <fct>                 <int>         <int> <int>  <dbl>
## 1 Sense & Sensibility      43           161  3405 0.0473
## 2 Pride & Prejudice        34           111  2104 0.0528
## 3 Mansfield Park           46           173  3685 0.0469
## 4 Emma                     15           151  3340 0.0452
## 5 Northanger Abbey         21           149  2982 0.0500
## 6 Persuasion                4            62  1807 0.0343

Now exploring the works of a few famous authors (with and without depression) using Project Gutenberg:

I want to explore the works of famous authors who suffered from depression and check whether the sentiment of their works differs from that of authors not known to have had the illness. I will again rely on Project Gutenberg. One caveat is that listing certain famous authors as a comparison group for “not having depression” is inherently difficult, because this cannot be confirmed accurately: they may have had depression but never been diagnosed, or they may have kept it private because of the stigma around mental illness in earlier historical periods. This comparison is therefore mostly for illustrative purposes.

The authors I want to include in this analysis are:

- Those with depression: Friedrich Nietzsche, Virginia Woolf, Edgar Allan Poe, Ernest Hemingway, Fyodor Dostoevsky, Leo Tolstoy.
- Those not known to have had depression (comparison group): Jane Austen, Mark Twain, Agatha Christie, George Bernard Shaw, Louisa May Alcott.

# install.packages("gutenbergr")
library(gutenbergr)
library(dplyr)

## Download the Project Gutenberg works whose catalogue `author` field equals
## `author_name`, keeping selected metadata columns alongside the text.
##
## author_name: author string to match exactly against the catalogue.
## Returns the downloaded table with its `author` column set to `author_name`.
download_works_with_metadata <- function(author_name) {
  wanted_fields <- c(
    "author", "title", "gutenberg_id", "language",
    "gutenberg_bookshelf", "rights"
  )
  catalogue <- gutenberg_works(author == author_name)
  texts <- gutenberg_download(catalogue, meta_fields = wanted_fields)
  # Overwrite `author` so every row carries the requested spelling.
  mutate(texts, author = author_name)
}

## Authors known to have had Depression
## One download per author; names must match the catalogue's author field.
nietzsche_works <- download_works_with_metadata(
  author_name = "Nietzsche, Friedrich Wilhelm"
)
woolf_works <- download_works_with_metadata(author_name = "Woolf, Virginia")
poe_works <- download_works_with_metadata(author_name = "Poe, Edgar Allan")
hemingway_works <- download_works_with_metadata(
  author_name = "Hemingway, Ernest"
)
dostoevsky_works <- download_works_with_metadata(
  author_name = "Dostoyevsky, Fyodor"
)
tolstoy_works <- download_works_with_metadata(
  author_name = "Tolstoy, Leo, graf"
)

## Authors known to not have had Depression
## One download per author; names must match the catalogue's author field.
austen_works <- download_works_with_metadata(author_name = "Austen, Jane")
twain_works <- download_works_with_metadata(author_name = "Twain, Mark")
# The variable name `christtie_works` (sic) is kept as-is because the code
# that combines the comparison group refers to it by this spelling.
christtie_works <- download_works_with_metadata(
  author_name = "Christie, Agatha"
)
shaw_works <- download_works_with_metadata(author_name = "Shaw, Bernard")
alcott_works <- download_works_with_metadata(
  author_name = "Alcott, Louisa May"
)

Combining the Data Frames:

## Combine authors known to have had depression
## Row-bind the six per-author tables into one.
depression_authors_works <- do.call(
  rbind,
  list(
    nietzsche_works, woolf_works, poe_works,
    hemingway_works, dostoevsky_works, tolstoy_works
  )
)

## Combine authors known to not have had depression
## Row-bind the five per-author tables into one.
non_depression_authors_works <- do.call(
  rbind,
  list(
    austen_works, twain_works,
    christtie_works, shaw_works, alcott_works
  )
)

# Saving the work so far in CSV files (to reduce having to redo all the steps if need be later):
# NOTE(review): both destinations are absolute paths on one machine; the
# calls fail wherever that directory does not exist.
write.csv(
  x = depression_authors_works,
  file = "C:/Users/teraw/Dropbox/_CUNY SPS MSDS/-- DATA 607/5- Modules/Week 10/depression_authors_works.csv",
  row.names = TRUE
)

write.csv(
  x = non_depression_authors_works,
  file = "C:/Users/teraw/Dropbox/_CUNY SPS MSDS/-- DATA 607/5- Modules/Week 10/non_depression_authors_works.csv",
  row.names = TRUE
)

Now I am going to clean and tidy the data. I will convert the texts into a tidy format in which each row contains a single word; in other words, the next step is “tokenization”. Then I will remove common stopwords (words like “the” and “and” that do not contribute much meaning to the sentiment analysis). I will also add steps for:

- Lowercasing all words, so that the same word is recognized as the same token regardless of its case.
- Removing punctuation and numbers, to focus only on words.

library(tidytext)
library(dplyr)
library(stringr)

# Tokenization, stop-word removal, lowercasing and number removal
data("stop_words")

# Turn a table of works (one row per line of `text`) into a tidy table with
# one word per row. The steps run in this order:
#   1. split `text` into words,
#   2. drop words found in `stop_words`,
#   3. lowercase the remaining words,
#   4. drop tokens that consist of digits only.
tokenize_and_clean <- function(works) {
  works %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word") %>%
    mutate(word = tolower(word)) %>%
    filter(!str_detect(word, "^[0-9]+$"))
}

depression_authors_works_tidy <- tokenize_and_clean(depression_authors_works)
non_depression_authors_works_tidy <-
  tokenize_and_clean(non_depression_authors_works)

Performing the sentiment analysis.

Using the bing sentiment lexicon:

library(tidytext)
library(ggplot2)
library(tidyr)
library(dplyr)


# Sentiment analysis for depression group
# Net Bing score per work: number of words matched as "positive" minus the
# number matched as "negative". The result has one row per `gutenberg_id`.
depression_sentiment <- depression_authors_works_tidy %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    # Some words match more than one lexicon row, so the fan-out is expected;
    # declaring it keeps the join from warning about it.
    relationship = "many-to-many"
  ) %>%
  count(gutenberg_id, sentiment) %>%
  # `pivot_wider()` replaces the superseded `spread()`; labels that never
  # occur for a work are filled with 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)

## Warning in inner_join(., get_sentiments("bing"), by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 28665 of `x` matches multiple rows in `y`.
## ℹ Row 2102 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Sentiment analysis for non-depression group
# Net Bing score per work: number of words matched as "positive" minus the
# number matched as "negative". The result has one row per `gutenberg_id`.
non_depression_sentiment <- non_depression_authors_works_tidy %>%
  inner_join(
    get_sentiments("bing"),
    by = "word",
    # Some words match more than one lexicon row, so the fan-out is expected;
    # declaring it keeps the join from warning about it.
    relationship = "many-to-many"
  ) %>%
  count(gutenberg_id, sentiment) %>%
  # `pivot_wider()` replaces the superseded `spread()`; labels that never
  # occur for a work are filled with 0.
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment_score = positive - negative)

## Warning in inner_join(., get_sentiments("bing"), by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 107165 of `x` matches multiple rows in `y`.
## ℹ Row 6514 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Merge sentiment scores with the per-work metadata to include author and title.
# The tidy tables hold one row per word, so they are first reduced to one row
# per work; merging against them directly would repeat every work's score once
# per word and inflate anything summed or plotted from the result.
depression_sentiment_enriched <- merge(
  depression_sentiment,
  distinct(depression_authors_works_tidy, gutenberg_id, author, title),
  by = "gutenberg_id",
  all.x = TRUE
)

non_depression_sentiment_enriched <- merge(
  non_depression_sentiment,
  distinct(non_depression_authors_works_tidy, gutenberg_id, author, title),
  by = "gutenberg_id",
  all.x = TRUE
)

# Plot for depression group
# Horizontal bars, one position per author, ordered by sentiment score.
ggplot(
  data = depression_sentiment_enriched,
  mapping = aes(
    x = reorder(author, sentiment_score),
    y = sentiment_score,
    fill = author
  )
) +
  geom_col() +
  labs(
    title = "Sentiment Score by Author for Depression Group",
    x = "Author",
    y = "Sentiment Score"
  ) +
  theme(legend.position = "none") +
  coord_flip()

# Plot for non-depression group
# Horizontal bars, one position per author, ordered by sentiment score.
ggplot(
  data = non_depression_sentiment_enriched,
  mapping = aes(
    x = reorder(author, sentiment_score),
    y = sentiment_score,
    fill = author
  )
) +
  geom_col() +
  labs(
    title = "Sentiment Score by Author for Non-Depression Group",
    x = "Author",
    y = "Sentiment Score"
  ) +
  theme(legend.position = "none") +
  coord_flip()

Combining the plots for a better visual comparison:

# Tag each table with its group label, then stack the two tables.
depression_sentiment_enriched[["group"]] <- "Depression"
non_depression_sentiment_enriched[["group"]] <- "Non-Depression"

combined_sentiment_data <- do.call(
  rbind,
  list(depression_sentiment_enriched, non_depression_sentiment_enriched)
)

# Draw one bar per author, one panel per group.
# `position_dodge()` does not stack, so several rows sharing an author would
# be drawn on top of each other. The data is therefore reduced to one row per
# work (in case the input repeats a work) and then summed within each author
# before plotting.
combined_sentiment_data %>%
  distinct(group, author, gutenberg_id, sentiment_score) %>%
  group_by(group, author) %>%
  summarise(sentiment_score = sum(sentiment_score), .groups = "drop") %>%
  ggplot(aes(x = reorder(author, sentiment_score), y = sentiment_score, fill = group)) +
  geom_col(width = 0.7) +
  coord_flip() +
  facet_wrap(~group, scales = "free", ncol = 1) +
  scale_fill_manual(values = c("Depression" = "#FF9999", "Non-Depression" = "#9999FF")) +
  labs(title = "Sentiment Score by Author Group",
       x = "Author",
       y = "Sentiment Score") +
  theme_minimal() +
  theme(legend.position = "top",
        plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
        axis.title.x = element_text(size = 12, face = "bold"),
        axis.title.y = element_text(size = 12, face = "bold"),
        strip.text = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12),
        legend.title = element_text(size = 12, face = "bold")) +
  guides(fill = guide_legend(title = "Group"))

Conclusion:

The sentiment scores are calculated based on the words used in their texts, with positive sentiment contributing to a higher score and negative sentiment contributing to a lower score.
All authors in the “Depression” group have negative sentiment scores. This indicates that, on average, words associated with negative sentiment outweigh those associated with positive sentiment in their works. It’s also notable that some authors, such as Nietzsche, have sentiment scores that are more negative than others.
The “Non-Depression” group shows a mix of negative and positive sentiment scores. This suggests more variability in sentiment across these authors’ works. For example, Louisa May Alcott shows a positive sentiment score, which implies that positive words are more prevalent in her texts.
There seems to be a trend where the group known to have had depression has consistently lower sentiment scores than the group not known to have had depression. However, it’s important to note that there are exceptions, such as Agatha Christie, who also shows a negative sentiment score.

Caution with the previous conclusions (i.e., limitations of this analysis): Interpretation of sentiment scores should take into account the context in which these authors wrote, their literary styles, and the genres they worked in. Additionally, sentiment analysis based on word occurrence does not capture literary nuances such as sarcasm, irony, and complex emotional expression.

Another Way: Using the SentiWordNet sentiment lexicon:

SentiWordNet assigns three sentiment scores to each WordNet synset: positivity, negativity, and objectivity. SentiWordNet is described in detail in these papers: https://github.com/aesuli/SentiWordNet/blob/master/papers/LREC06.pdf and https://github.com/aesuli/SentiWordNet/blob/master/papers/LREC10.pdf

# SentiWordNet can be downloaded from its official repo
sentiwordnet_url <- "https://raw.githubusercontent.com/aesuli/SentiWordNet/master/data/SentiWordNet_3.0.0.txt"
# Download only when no local copy exists, so re-running the analysis does
# not hit the network again. Delete the file to force a fresh download.
if (!file.exists("SentiWordNet_3.0.0.txt")) {
  download.file(sentiwordnet_url, destfile = "SentiWordNet_3.0.0.txt")
}

# Read SentiWordNet data, skipping the comment lines.
# Comment lines (including the column header) start with "#", but "#" also
# occurs inside the data: every synset term is written as `term#sense`.
# Comment handling is therefore switched off in the parser
# (`comment.char = ""`), otherwise each row would be cut at its first term.
# Whole comment lines are removed beforehand instead. Quote handling is
# disabled so quotation marks inside glosses cannot merge rows.
sentiwordnet_lines <- readLines("SentiWordNet_3.0.0.txt", warn = FALSE)
sentiwordnet_lines <- sentiwordnet_lines[
  nzchar(trimws(sentiwordnet_lines)) & !startsWith(sentiwordnet_lines, "#")
]

sentiwordnet_raw <- read.delim(
  text = sentiwordnet_lines,
  header = FALSE, # the header is a comment line, so no data row is consumed
  quote = "",
  comment.char = "",
  col.names = c("POS", "ID", "PosScore", "NegScore", "SynsetTerms", "Gloss"),
  colClasses = c("character", "character", "numeric", "numeric",
                 "character", "character"),
  stringsAsFactors = FALSE
)

# Process the SynsetTerms column to extract individual words:
# a synset lists its terms separated by spaces, so emit one row per term,
# repeat the synset's scores for each of them, and strip the `#sense` suffix.
synset_terms <- strsplit(sentiwordnet_raw$SynsetTerms, " ", fixed = TRUE)
terms_per_synset <- lengths(synset_terms)

sentiwordnet <- data.frame(
  word = sub("#[0-9]+$", "", unlist(synset_terms)),
  PosScore = rep(sentiwordnet_raw$PosScore, times = terms_per_synset),
  NegScore = rep(sentiwordnet_raw$NegScore, times = terms_per_synset),
  stringsAsFactors = FALSE
)
# A word appears once for every synset it belongs to, so joining on `word`
# matches several rows per token.
sentiwordnet <- sentiwordnet[nzchar(sentiwordnet$word), ]


library(dplyr)
library(tidyr)

# Join with SentiWordNet and Calculate Sentiment for Depression Group
# Per work: total positivity, total negativity, and their difference.
depression_sentiment_analysis <- depression_authors_works_tidy %>%
  inner_join(sentiwordnet, by = "word") %>%
  group_by(gutenberg_id, author, title) %>%
  summarise(
    PositivityScore = sum(PosScore, na.rm = TRUE),
    NegativityScore = sum(NegScore, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(NetSentimentScore = PositivityScore - NegativityScore)

## Warning in inner_join(., sentiwordnet, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 6199 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## `summarise()` has grouped output by 'gutenberg_id', 'author'. You can override
## using the `.groups` argument.

# Join with SentiWordNet and Calculate Sentiment for Non-Depression Group
# Per work: total positivity, total negativity, and their difference.
non_depression_sentiment_analysis <- non_depression_authors_works_tidy %>%
  inner_join(sentiwordnet, by = "word") %>%
  group_by(gutenberg_id, author, title) %>%
  summarise(
    PositivityScore = sum(PosScore, na.rm = TRUE),
    NegativityScore = sum(NegScore, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(NetSentimentScore = PositivityScore - NegativityScore)

## Warning in inner_join(., sentiwordnet, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 5 of `x` matches multiple rows in `y`.
## ℹ Row 52704 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## `summarise()` has grouped output by 'gutenberg_id', 'author'. You can override
## using the `.groups` argument.

# Visualize 
library(ggplot2)

# Combine the results for visualization
# Label each table with its group, then stack them.
combined_sentiment_analysis <- bind_rows(
  mutate(depression_sentiment_analysis, Group = "Depression"),
  mutate(non_depression_sentiment_analysis, Group = "Non-Depression")
)

# Draw one bar per author, one panel per group.
# The input has one row per work. `position_dodge()` does not stack, so an
# author with several works would get overlapping bars; the net scores are
# therefore summed within each author before plotting.
combined_sentiment_analysis %>%
  group_by(Group, author) %>%
  summarise(NetSentimentScore = sum(NetSentimentScore), .groups = "drop") %>%
  ggplot(aes(x = reorder(author, NetSentimentScore), y = NetSentimentScore, fill = Group)) +
  geom_col(width = 0.7) +
  coord_flip() +
  scale_fill_manual(values = c("Depression" = "#E69F00", "Non-Depression" = "#56B4E9")) +
  labs(title = "Net Sentiment Score by Author Group",
       x = "Author",
       y = "Net Sentiment Score") +
  facet_wrap(~Group, scales = "free_y", ncol = 1) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(hjust = 0.5), # centre the legend title
    axis.title.x = element_text(size = 12, face = "bold"),
    axis.title.y = element_text(size = 12, face = "bold"),
    strip.text = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 10),
    plot.title = element_text(hjust = 0.5)
  )

Conclusion: With this second method of sentiment analysis, both groups display positive net sentiment scores, although the range of scores is wider in the depression group. There does not appear to be a clear distinction between the groups based on net sentiment alone.

607_week10_assignment

2024-03-31

Now exploring the works of a few famous authors (with and without depression) using Project Gutenberg:

Performing the sentiment analysis.

Using the bing sentiment lexicon:

Another Way: Using the SentiWordNet sentiment lexicon: