1 Overview

This report reproduces the primary example from Chapter 2: Sentiment Analysis from Text Mining with R by Julia Silge & David Robinson, then extends it by analyzing a Portuguese book that is one of my personal favorites—Amor de Perdição, by Camilo Castelo Branco—and incorporating an additional sentiment lexicon from another R package (Portuguese OpLexicon via lexiconPT).

2 Citations

Primary reference (book):
Silge, J., & Robinson, D. (2017). Text Mining with R: A Tidy Approach. O’Reilly. https://www.tidytextmining.com/

Key packages / resources used: tidyverse, tidytext, ggplot2, janeaustenr, gutenbergr (Gutenberg ID 16425), stopwords (Portuguese stop words), lexiconPT (OpLexicon v3.0), lexicon (Bing/NRC fallback), and optionally afinn (AFINN fallback).

3 Packages

library(tidyverse)
library(tidytext)
library(janeaustenr)
library(gutenbergr)
library(stopwords)
library(lexicon)
library(lexiconPT)

4 Primary Example from Text Mining with R (Chaper 2): Jane Austen + bing / AFINN / nrc

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # Avoid \d escape; use explicit digit/roman class:
    chapter = cumsum(str_detect(text, regex("^chapter [0-9ivxlc]+", ignore_case = TRUE)))
  ) %>%
  ungroup() %>%
  unnest_tokens(word, text)

nrc_try <- try(tidytext::get_sentiments("nrc"), silent = TRUE)
## Do you want to download:
##  Name: NRC Word-Emotion Association Lexicon 
##  URL: http://saifmohammad.com/WebPages/lexicons.html 
##  License: License required for commercial use. Please contact Saif M. Mohammad (saif.mohammad@nrc-cnrc.gc.ca). 
##  Size: 22.8 MB (cleaned 424 KB) 
##  Download mechanism: http 
##  Citation info:
## 
## This dataset was published in Saif M. Mohammad and Peter Turney. (2013), ``Crowdsourcing a Word-Emotion Association Lexicon.'' Computational Intelligence, 29(3): 436-465.
## 
## article{mohammad13,
## author = {Mohammad, Saif M. and Turney, Peter D.},
## title = {Crowdsourcing a Word-Emotion Association Lexicon},
## journal = {Computational Intelligence},
## volume = {29},
## number = {3},
## pages = {436-465},
## doi = {10.1111/j.1467-8640.2012.00460.x},
## url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1467-8640.2012.00460.x},
## eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1111/j.1467-8640.2012.00460.x},
## year = {2013}
## }
## If you use this lexicon, then please cite it.
if (inherits(nrc_try, "try-error")) {
  nrc_hash <- lexicon::hash_sentiment_nrc
  if (is.data.frame(nrc_hash) && all(c("x","y") %in% names(nrc_hash))) {
    if (is.numeric(nrc_hash$y)) {
      nrc <- tibble(word = nrc_hash$x, sentiment = ifelse(nrc_hash$y > 0, "positive", "negative"))
    } else {
      nrc <- tibble(word = nrc_hash$x, sentiment = as.character(nrc_hash$y))
    }
  } else if (is.list(nrc_hash) || is.environment(nrc_hash)) {
    tmp <- lapply(names(nrc_hash), function(s) tibble(word = nrc_hash[[s]], sentiment = s))
    nrc <- bind_rows(tmp)
  } else {
    nrc <- tibble(word = names(nrc_hash), sentiment = as.character(nrc_hash))
  }
} else {
  nrc <- nrc_try
}

nrc_joy <- nrc %>% filter(sentiment == "joy")

emma_joy_counts <- tidy_books %>%
  filter(book == "Emma") %>%
  inner_join(nrc_joy, by = "word") %>%
  count(word, sort = TRUE)

head(emma_joy_counts, 15)
## # A tibble: 0 × 2
## # ℹ 2 variables: word <chr>, n <int>

5 Fig. 2.2 of chapter 2 Sentiment over narrative time with bing

bing_try <- try(tidytext::get_sentiments("bing"), silent = TRUE)
if (inherits(bing_try, "try-error")) {
  bl <- lexicon::hash_sentiment_bingliu
  bing_sent <- tibble(
    word = names(bl),
    sentiment = ifelse(as.integer(bl) > 0, "positive", "negative")
  )
} else {
  bing_sent <- bing_try
}

jane_austen_sentiment <- tidy_books %>%
  inner_join(bing_sent, by = "word") %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  tidyr::pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

ggplot(jane_austen_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs(
    title = "Sentiment through the narratives of Jane Austen’s novels (bing)",
    x = "Narrative index (80-line chunks)",
    y = "Net sentiment (positive - negative)"
  )

6 Compare AFINN, bing, NRC on Pride & Prejudice

pride_prejudice <- tidy_books %>% filter(book == "Pride & Prejudice")

afinn <- NULL
afinn_try <- try(tidytext::get_sentiments("afinn"), silent = TRUE)
## Do you want to download:
##  Name: AFINN-111 
##  URL: http://www2.imm.dtu.dk/pubdb/views/publication_details.php?id=6010 
##  License: Open Database License (ODbL) v1.0 
##  Size: 78 KB (cleaned 59 KB) 
##  Download mechanism: https
if (!inherits(afinn_try, "try-error")) {
  afinn <- afinn_try
} else if (requireNamespace("afinn", quietly = TRUE)) {
  afinn <- afinn::afinn
  if (!all(c("word","value") %in% names(afinn))) afinn <- NULL
}

afinn_df <- NULL
if (!is.null(afinn)) {
  afinn_df <- pride_prejudice %>%
    inner_join(afinn, by = "word") %>%
    group_by(index = linenumber %/% 80) %>%
    summarise(sentiment = sum(value), .groups = "drop") %>%
    mutate(method = "AFINN")
}

bing_df <- pride_prejudice %>%
  inner_join(bing_sent, by = "word") %>%
  mutate(method = "Bing et al.")

nrc_posneg <- nrc %>% filter(sentiment %in% c("positive","negative"))
nrc_df <- pride_prejudice %>%
  inner_join(nrc_posneg, by = "word") %>%
  mutate(method = "NRC")

bing_and_nrc <- bind_rows(bing_df, nrc_df) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

combo <- if (!is.null(afinn_df)) bind_rows(afinn_df, bing_and_nrc) else bing_and_nrc

ggplot(combo, aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y") +
  labs(
    title = "Comparing AFINN, Bing, and NRC on Pride & Prejudice",
    x = "Narrative index (80-line chunks)",
    y = "Sentiment / Net sentiment"
  )

7 Extension 1 — Portuguese corpus: Amor de Perdição (Camilo Castelo Branco)

camilo_raw <- gutenberg_download(16425)

camilo <- camilo_raw %>%
  mutate(linenumber = row_number())

tidy_camilo <- camilo %>%
  tidytext::unnest_tokens(word, text, token = "words")

stop_pt <- tibble(word = stopwords::stopwords("pt"))

tidy_camilo <- tidy_camilo %>%
  anti_join(stop_pt, by = "word") %>%
  filter(str_detect(word, "^[[:alpha:]]+$"))

head(tidy_camilo)
## # A tibble: 6 × 3
##   gutenberg_id linenumber word    
##          <int>      <int> <chr>   
## 1        16425          1 amor    
## 2        16425          1 perdi   
## 3        16425          6 amor    
## 4        16425          6 perdi   
## 5        16425          9 memorias
## 6        16425          9 familia

7.1 Extension 2 — Additional lexicon: OpLexicon (Portuguese)

data(oplexicon_v3.0, package = "lexiconPT")

pt_lex <- oplexicon_v3.0 %>%
  transmute(word = term, value = as.integer(polarity))

summary(pt_lex$value)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.0000 -1.0000  0.0000 -0.1848  1.0000  1.0000
head(pt_lex)
##   word value
## 1   =[    -1
## 2   =@    -1
## 3   =p    -1
## 4   =P    -1
## 5   =x    -1
## 6   =d     1

7.1.1 Narrative-time sentiment (OpLexicon)

camilo_oplex <- tidy_camilo %>%
  inner_join(pt_lex, by = "word") %>%
  mutate(chunk = (linenumber %/% 100) + 1) %>%
  group_by(chunk) %>%
  summarise(score = sum(value), .groups = "drop")

ggplot(camilo_oplex, aes(chunk, score)) +
  geom_line(linewidth = 0.8) +
  labs(
    title = "Amor de Perdição: Sentiment over Narrative Time (OpLexicon, PT)",
    x = "Narrative chunk (~100 lines)",
    y = "Sum of OpLexicon polarity"
  )

7.1.2 Top positive/negative contributors (OpLexicon)

camilo_words_scored <- tidy_camilo %>%
  inner_join(pt_lex, by = "word")

camilo_posneg <- camilo_words_scored %>%
  mutate(sentiment = case_when(
    value > 0 ~ "positive",
    value < 0 ~ "negative",
    TRUE      ~ "neutral"
  )) %>%
  filter(sentiment != "neutral") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 12) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, n, sentiment))

ggplot(camilo_posneg, aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  scale_y_reordered() +
  labs(
    title = "Amor de Perdição: Top Sentiment Terms (OpLexicon, PT)",
    x = "Frequency",
    y = NULL
  )

8 Conclusion

In conclusion, this report closely follows the structure of Chapter 2 of TextMining with R, beginning with the identification of NRC “joy” words in Emma, continuing with the analysis of Bing sentiment scores and concluding with a comparison of AFINN, Bing, and NRC sentiment methods in Pride and Prejudice. The analysis is then extended to a Portuguese book, Amor de Perdição by Camilo Castelo Branco, who is one my favorite Portuguese authors, using the OpLexicon sentiment lexicon to ensure linguistic and cultural accuracy for Portuguese-language data.