Análisis de textos de Jane Austen

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.5.2

library(viridisLite)
library(janeaustenr)

## Warning: package 'janeaustenr' was built under R version 4.5.2

library(stopwords)

## Warning: package 'stopwords' was built under R version 4.5.2

library(gridExtra)

## 
## Adjuntando el paquete: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(gt)

austen_texts <-  austen_books()

austen_limpio <- austen_books() %>% 
  mutate(text = gsub("[[:punct:]]", "", text)) %>% 
  mutate(text = tolower(text)) %>%
  mutate(text = gsub("[[:digit:]]", "", text)) %>% 
  filter(!grepl("jane|austen|chapter", text))

austen_tokens <- austen_limpio %>% 
  unnest_tokens(word, text, token = "words") %>% 
  anti_join(get_stopwords(source = "stopwords-iso"))

## Joining with `by = join_by(word)`

any(is.na(austen_tokens$word))

## [1] FALSE

austen_tokens %>% 
  count(word, sort= TRUE)

## # A tibble: 18,106 × 2
##    word          n
##    <chr>     <int>
##  1 time       1289
##  2 fanny       849
##  3 lady        814
##  4 sir         791
##  5 emma        737
##  6 day         710
##  7 sister      695
##  8 house       661
##  9 elizabeth   654
## 10 elinor      616
## # ℹ 18,096 more rows

austen_sent <- austen_tokens %>% 
  inner_join(get_sentiments("bing")) %>% 
  group_by(book,sentiment) %>% 
  summarise(n= n()) %>% 
  ungroup() %>% 
  arrange(desc(book))

## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("bing")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 121757 of `x` matches multiple rows in `y`.
## ℹ Row 5051 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.

austen_sent %>% 
  gt() %>%
  tab_header(
    title = "Análisis de sentimiento",
    subtitle = "Cantidad de palabras positivas y negativas, agrupadas por libro")

book	sentiment	n
Análisis de sentimiento
Cantidad de palabras positivas y negativas, agrupadas por libro
Persuasion	negative	2027
Persuasion	positive	2461
Northanger Abbey	negative	2232
Northanger Abbey	positive	2368
Emma	negative	3847
Emma	positive	4787
Mansfield Park	negative	4293
Mansfield Park	positive	4773
Pride & Prejudice	negative	3211
Pride & Prejudice	positive	3700
Sense & Sensibility	negative	3314
Sense & Sensibility	positive	3739

austen_sent %>% 
  ggplot(aes(x= book, y= n, fill= sentiment))+
  geom_col(position = "dodge")+
  labs(title = "Análisis de sentimiento de los libros de Jane Austen", 
       subtitle = "Cantidad de palabras positivas y negativas, agrupadas por libro",
       x= "Libro", y = "Cantidad",
       fill= NULL)+
  theme_minimal()

Análisis de textos de Jane Austen

2025-12-19