Instalar paquetes y llamar librerías

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(syuzhet)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(knitr)

Cargar base de datos

# Read the movie-review dataset from the user's Desktop; text columns stay
# as character vectors rather than factors.
movies <- read.csv(
  file = "~/Desktop/movie_reviews.csv",
  stringsAsFactors = FALSE
)

# Preview the first three rows, leaving out the long review_text column so
# the printout stays readable.
head(select(movies, -review_text), n = 3)
##   X review_id      movie_name year reviewer_name rated year_api
## 1 0         1 Lethal Weapon 3 1992   J. Boyajian     R     1992
## 2 1         2 Lethal Weapon 3 1999 Frank Maloney     R     1992
## 3 2         3 Lethal Weapon 3 1995      Brian L.     R     1992
##                     genre      directors
## 1 Action, Crime, Thriller Richard Donner
## 2 Action, Crime, Thriller Richard Donner
## 3 Action, Crime, Thriller Richard Donner
##                                        writers
## 1 Jeffrey Boam, Robert Mark Kamen, Shane Black
## 2 Jeffrey Boam, Robert Mark Kamen, Shane Black
## 3 Jeffrey Boam, Robert Mark Kamen, Shane Black
##                                actors
## 1 Mel Gibson, Danny Glover, Joe Pesci
## 2 Mel Gibson, Danny Glover, Joe Pesci
## 3 Mel Gibson, Danny Glover, Joe Pesci
##                                                                                                                                                                                            plot
## 1 Martin Riggs and Roger Murtaugh pursue a former LAPD officer who uses his knowledge of police procedure and policies to steal and sell confiscated guns and ammunition to local street gangs.
## 2 Martin Riggs and Roger Murtaugh pursue a former LAPD officer who uses his knowledge of police procedure and policies to steal and sell confiscated guns and ammunition to local street gangs.
## 3 Martin Riggs and Roger Murtaugh pursue a former LAPD officer who uses his knowledge of police procedure and policies to steal and sell confiscated guns and ammunition to local street gangs.
##   first_genre first_actor first_director first_writer first_actor_gender
## 1      Action  Mel Gibson Richard Donner Jeffrey Boam             female
## 2      Action  Mel Gibson Richard Donner Jeffrey Boam             female
## 3      Action  Mel Gibson Richard Donner Jeffrey Boam             female
##   first_director_gender first_writer_gender
## 1                  male                male
## 2                  male                male
## 3                  male                male

Limpieza básica del texto

# Basic text cleaning of the reviews:
#   1. drop rows whose review_text is missing or empty,
#   2. lower-case the text,
#   3. replace every character that is not a-z, whitespace or an apostrophe
#      with a space,
#   4. collapse runs of whitespace into one space and trim both ends.
movies2 <- movies %>%
  filter(!is.na(review_text), review_text != "") %>%
  mutate(
    review_text = tolower(review_text),
    # The POSIX class [:space:] is used here because escapes such as \s are
    # not reliably interpreted inside a bracket expression by R's default
    # regex engine (the backslash and the letter "s" can be taken literally,
    # which lets backslashes slip through the cleaning).
    review_text = gsub("[^a-z[:space:]']", " ", review_text),
    review_text = gsub("\\s+", " ", review_text),
    review_text = trimws(review_text)
  )

# Number of reviews remaining after dropping missing/empty texts.
nrow(movies2)
## [1] 18862

Mejores y peores películas por sentimiento

# Score every cleaned review with the AFINN method of get_sentiment() and
# store the result in a new sent_score column.
movies2 <- mutate(
  movies2,
  sent_score = get_sentiment(review_text, method = "afinn")
)

# One row per movie: number of reviews plus mean and median review score.
pelis_rank <- summarise(
  group_by(movies2, movie_name),
  n_reviews = n(),
  sent_promedio = mean(sent_score),
  sent_mediana = median(sent_score),
  .groups = "drop"
)

# Keep only movies with at least 5 reviews, then rank them from the highest
# to the lowest mean score.
pelis_rank <- filter(pelis_rank, n_reviews >= 5)
pelis_rank <- arrange(pelis_rank, desc(sent_promedio))

# The five best-ranked movies, and the five worst (re-sorted ascending so the
# lowest mean comes first).
top5 <- head(pelis_rank, n = 5)
bottom5 <- head(arrange(pelis_rank, sent_promedio), n = 5)

top5
## # A tibble: 5 × 4
##   movie_name                    n_reviews sent_promedio sent_mediana
##   <chr>                             <int>         <dbl>        <dbl>
## 1 Beauty and the Beast                  7          46             41
## 2 Almost Famous                        23          44.9           45
## 3 Tous les matins du monde              5          44             45
## 4 Billy's Hollywood Screen Kiss         5          42.6           37
## 5 Wild Man Blues                        7          42.3           36
bottom5
## # A tibble: 5 × 4
##   movie_name                                n_reviews sent_promedio sent_mediana
##   <chr>                                         <int>         <dbl>        <dbl>
## 1 Paradise Lost: The Child Murders at Robi…         6         -37.3        -44.5
## 2 Bandit Queen                                      6         -34.8        -26  
## 3 M                                                 6         -27.7        -26  
## 4 Jason Goes to Hell: The Final Friday              5         -23.8        -20  
## 5 Bully                                             6         -23.3        -27.5

Gráfica: Top 10 y Bottom 10

# Ten highest and ten lowest movies by mean sentiment, each tagged with the
# panel ("tipo") it will be drawn in.
top10 <- mutate(head(pelis_rank, n = 10), tipo = "Top 10")
bot10 <- mutate(
  head(arrange(pelis_rank, sent_promedio), n = 10),
  tipo = "Bottom 10"
)

# Stack both groups and order the movie names by their mean score so the
# bars appear sorted.
rank_plot <- bind_rows(top10, bot10)
rank_plot$movie_name <- reorder(rank_plot$movie_name, rank_plot$sent_promedio)

# Horizontal bar chart with one panel per group; the movie-name axis is freed
# per panel via scales = "free_y".
ggplot(data = rank_plot, mapping = aes(x = movie_name, y = sent_promedio)) +
  geom_col() +
  coord_flip() +
  facet_wrap(vars(tipo), scales = "free_y") +
  labs(
    title = "Mejores y peores películas por sentimiento (AFINN)",
    x = "Película",
    y = "Sentimiento promedio"
  )

Análisis de Emociones y Sentimientos

# Emotion and sentiment analysis (NRC lexicon)
library(dplyr)
library(syuzhet)

# Fixed seed so the random sample below is reproducible.
set.seed(1)

# 1) Draw a random sample of at most 200 reviews to keep processing fast
#    (raise or lower 200 as needed).
muestra <- slice_sample(movies2, n = min(200, nrow(movies2)))

# 2) Concatenate the sampled reviews into one string and keep only its first
#    50000 characters to bound the amount of text processed (adjust as
#    needed, e.g. 30k-80k).
texto <- substr(paste(muestra[["review_text"]], collapse = " "), 1, 50000)

# 3) Split the text into tokens and look each one up in the NRC lexicon.
tokens <- get_tokens(texto)
emociones <- get_nrc_sentiment(tokens, language = "english")

# Wider bottom margin so the rotated (las = 2) bar labels fit.
par(mar = c(6, 4, 2, 1))

# Total count per column for the first eight columns of the NRC result
# (the emotion categories; the negative/positive columns are used below).
barplot(
  colSums(emociones[, 1:8]),
  las = 2,
  main = "Distribución de emociones (NRC)"
)

# Net valence per token: positive count minus negative count.
sentimientos <- emociones$positive - emociones$negative
simple_plot(sentimientos)

Nube de Palabras

library(wordcloud)
library(RColorBrewer)
library(tm)

# Words to leave out of the cloud: English stop words plus generic
# review vocabulary that would otherwise dominate it.
palabras_excluidas <- c(
  stopwords("english"),
  "film", "movie", "one", "like", "just", "really",
  "can", "will", "get", "make", "see", "even", "also"
)

# Reuse the tokens computed for the emotion analysis and drop the excluded
# words. Tokens shorter than three characters are dropped as well, mirroring
# the default minimum word length tm applied when wordcloud() built its own
# term matrix.
palabras <- tokens[!(tokens %in% palabras_excluidas) & nchar(tokens) >= 3]

# Explicit word frequencies. Passing `freq` keeps wordcloud() from wrapping
# every token in a one-word tm corpus, which is what produced the
# "transformation drops documents" warnings and a second, hidden stop-word
# pass.
frecuencias <- table(palabras)

# Fixed seed so the word placement is reproducible.
set.seed(123)

wordcloud(
  words = names(frecuencias),
  freq = as.vector(frecuencias),
  max.words = 70,
  min.freq = 6,
  random.order = FALSE,
  rot.per = 0.1,
  colors = brewer.pal(8, "Dark2")
)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

Incluir PNG/GIF de las películas mencionadas

# Paths of the five image files embedded in the report.
rutas_imagenes <- c(
  "img/images1.jpg",
  "img/images2.jpg",
  "img/images3.jpg",
  "img/images4.jpg",
  "img/images5.jpg"
)
knitr::include_graphics(rutas_imagenes)

# Print the contents of the img/ folder as a check that the files exist.
list.files(path = "img")
## [1] "images1.jpg" "images2.jpg" "images3.jpg" "images4.jpg" "images5.jpg"

Conclusiones

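Con la base movie_reviews.csv se identificaron las películas con mejor y peor percepción según el sentimiento promedio (AFINN) de sus reseñas, considerando únicamente películas con al menos 5 reseñas. Además, a partir de una muestra de reseñas, se obtuvo un resumen de emociones (NRC) y una nube de palabras con los términos más frecuentes.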

LS0tCnRpdGxlOiAiTW92aWVfcmV2aWV3cyAtIEEwMTI4NjMwMCIKYXV0aG9yOiAiUGFvbGEgU2FsYXMsIFJvYmVydGEgRmFsY29uLCBSdWJlbiBSb2JsZXMsIFNhbWFudGhhIEdhY2lhLCBNYXJpYW5hIEdhcmliYXkiCmRhdGU6ICIyMDI2LTAyLTIzIgpvdXRwdXQ6CiAgaHRtbF9kb2N1bWVudDoKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OiB0cnVlCiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgICB0aGVtZTogY29zbW8KZWRpdG9yX29wdGlvbnM6CiAgbWFya2Rvd246CiAgICB3cmFwOiA3MgotLS0KCiMgWyoqSW5zdGFsYXIgcGFxdWV0ZXMgeSBsbGFtYXIgbGlicmVyw61hcyoqXXtzdHlsZT0iY29sb3I6ICNmZjAwN2Y7In0KCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShzeXV6aGV0KQpsaWJyYXJ5KHRtKQpsaWJyYXJ5KHdvcmRjbG91ZCkKbGlicmFyeShSQ29sb3JCcmV3ZXIpCmxpYnJhcnkoa25pdHIpCmBgYAoKIyBbKipDYXJnYXIgYmFzZSBkZSBkYXRvcyoqXXtzdHlsZT0iY29sb3I6ICNmZjAwN2YifQoKYGBge3J9Cm1vdmllcyA8LSByZWFkLmNzdigifi9EZXNrdG9wL21vdmllX3Jldmlld3MuY3N2Iiwgc3RyaW5nc0FzRmFjdG9ycyA9IEZBTFNFKQoKbW92aWVzICU+JQogIHNlbGVjdCgtcmV2aWV3X3RleHQpICU+JQogIGhlYWQoMykKYGBgCgojIFsqKkxpbXBpZXphIGLDoXNpY2EgZGVsIHRleHRvKipde3N0eWxlPSJjb2xvcjogI2ZmMDA3ZjsifQoKYGBge3J9Cm1vdmllczIgPC0gbW92aWVzICU+JQogIGZpbHRlcighaXMubmEocmV2aWV3X3RleHQpLCByZXZpZXdfdGV4dCAhPSAiIikgJT4lCiAgbXV0YXRlKAogICAgcmV2aWV3X3RleHQgPSB0b2xvd2VyKHJldmlld190ZXh0KSwKICAgIHJldmlld190ZXh0ID0gZ3N1YigiW15hLXpcXHMnXSIsICIgIiwgcmV2aWV3X3RleHQpLAogICAgcmV2aWV3X3RleHQgPSBnc3ViKCJcXHMrIiwgIiAiLCByZXZpZXdfdGV4dCksCiAgICByZXZpZXdfdGV4dCA9IHRyaW13cyhyZXZpZXdfdGV4dCkKICApCgpucm93KG1vdmllczIpCmBgYAoKIyBbKipNZWpvcmVzIHkgcGVvcmVzIHBlbMOtY3VsYXMgcG9yIHNlbnRpbWllbnRvKipde3N0eWxlPSJjb2xvcjogI2ZmMDA3ZjsifQoKYGBge3J9Cm1vdmllczIkc2VudF9zY29yZSA8LSBnZXRfc2VudGltZW50KG1vdmllczIkcmV2aWV3X3RleHQsIG1ldGhvZCA9ICJhZmlubiIpCgpwZWxpc19yYW5rIDwtIG1vdmllczIgJT4lCiAgZ3JvdXBfYnkobW92aWVfbmFtZSkgJT4lCiAgc3VtbWFyaXNlKAogICAgbl9yZXZpZXdzID0gbigpLAogICAgc2VudF9wcm9tZWRpbyA9IG1lYW4oc2VudF9zY29yZSksCiAgICBzZW50X21lZGlhbmEgPSBtZWRpYW4oc2VudF9zY29yZSksCiAgICAuZ3JvdXBzID0gImRyb3AiCiAgKSAlPiUKICBmaWx0ZXIobl9yZXZpZXdzID49IDUpICU+JQogIGFycmFuZ2UoZGVzYyhzZW50X3Byb21lZGlvKSkKCnRvcDUgPC0gcGVsaXNfcmFuayAlPiUgc2xpY2VfaGVhZChuID0gNSkKYm90dG9tNSA8LSBwZWxp
c19yYW5rICU+JSBhcnJhbmdlKHNlbnRfcHJvbWVkaW8pICU+JSBzbGljZV9oZWFkKG4gPSA1KQoKdG9wNQpib3R0b201CmBgYAoKIyBbKipHcsOhZmljYTogVG9wIDEwICoqXXtzdHlsZT0iY29sb3I6ICNmZjAwN2Y7In0KCmBgYHtyfQp0b3AxMCA8LSBwZWxpc19yYW5rICU+JSBzbGljZV9oZWFkKG4gPSAxMCkgJT4lIG11dGF0ZSh0aXBvID0gIlRvcCAxMCIpCmJvdDEwIDwtIHBlbGlzX3JhbmsgJT4lIGFycmFuZ2Uoc2VudF9wcm9tZWRpbykgJT4lIHNsaWNlX2hlYWQobiA9IDEwKSAlPiUgbXV0YXRlKHRpcG8gPSAiQm90dG9tIDEwIikKCnJhbmtfcGxvdCA8LSBiaW5kX3Jvd3ModG9wMTAsIGJvdDEwKSAlPiUKICBtdXRhdGUobW92aWVfbmFtZSA9IHJlb3JkZXIobW92aWVfbmFtZSwgc2VudF9wcm9tZWRpbykpCgpnZ3Bsb3QocmFua19wbG90LCBhZXMoeCA9IG1vdmllX25hbWUsIHkgPSBzZW50X3Byb21lZGlvKSkgKwogIGdlb21fY29sKCkgKwogIGNvb3JkX2ZsaXAoKSArCiAgZmFjZXRfd3JhcCh+dGlwbywgc2NhbGVzID0gImZyZWVfeSIpICsKICBsYWJzKAogICAgdGl0bGUgPSAiTWVqb3JlcyB5IHBlb3JlcyBwZWzDrWN1bGFzIHBvciBzZW50aW1pZW50byAoQUZJTk4pIiwKICAgIHggPSAiUGVsw61jdWxhIiwKICAgIHkgPSAiU2VudGltaWVudG8gcHJvbWVkaW8iCiAgKQpgYGAKCiMgWyoqQW7DoWxpc2lzIGRlIEVtb2Npb25lcyB5IFNlbnRpbWllbnRvcyoqXXtzdHlsZT0iY29sb3I6ICNmZjAwN2Y7In0KCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CiMgPHNwYW4gc3R5bGU9ImNvbG9yOnJlZCI+QW7DoWxpc2lzIGRlIEVtb2Npb25lcyB5IFNlbnRpbWllbnRvczwvc3Bhbj4KbGlicmFyeShkcGx5cikKbGlicmFyeShzeXV6aGV0KQoKc2V0LnNlZWQoMSkKCiMgMSkgU29sbyAyMDAgcmVzZcOxYXMgKHPDunBlciByw6FwaWRvKS4gQ2FtYmlhIDIwMCBzaSBxdWllcmVzLgptdWVzdHJhIDwtIG1vdmllczIgJT4lIHNsaWNlX3NhbXBsZShuID0gbWluKDIwMCwgbnJvdyhtb3ZpZXMyKSkpCgojIDIpIEp1bnRhIHRleHRvIHkgcmVjb3J0YSAocGFyYSBubyBwcm9jZXNhciBkZW1hc2lhZG8pCnRleHRvIDwtIHBhc3RlKG11ZXN0cmEkcmV2aWV3X3RleHQsIGNvbGxhcHNlID0gIiAiKQp0ZXh0byA8LSBzdWJzdHIodGV4dG8sIDEsIDUwMDAwKSAgICMgY2FtYmlhIDUwMDAwIHNpIHF1aWVyZXMgKDMway04MGspCgojIDMpIFRva2VucyB5IGVtb2Npb25lcwp0b2tlbnMgPC0gZ2V0X3Rva2Vucyh0ZXh0bykKZW1vY2lvbmVzIDwtIGdldF9ucmNfc2VudGltZW50KHRva2VucywgbGFuZ3VhZ2UgPSAiZW5nbGlzaCIpCgpwYXIobWFyPWMoNiw0LDIsMSkpCmJhcnBsb3QoY29sU3VtcyhlbW9jaW9uZXNbLDE6OF0pLAogICAgICAgIGxhcz0yLCBtYWluPSJEaXN0cmlidWNpw7NuIGRlIGVtb2Npb25lcyAoTlJDKSIpCgpzZW50aW1pZW50b3MgPC0gKGVtb2Npb25lcyRuZWdhdGl2ZSAqIC0xKSArIGVtb2Npb25lcyRwb3NpdGl2ZQpz
aW1wbGVfcGxvdChzZW50aW1pZW50b3MpCmBgYAoKIyBbKipOdWJlIGRlIFBhbGFicmFzKipde3N0eWxlPSJjb2xvcjogI2ZmMDA3ZjsifQoKYGBge3J9CmxpYnJhcnkod29yZGNsb3VkKQpsaWJyYXJ5KFJDb2xvckJyZXdlcikKbGlicmFyeSh0bSkKCiMgdXNhciB0b2tlbnMgcXVlIHlhIGNhbGN1bGFzdGUKcGFsYWJyYXMgPC0gdG9rZW5zCgojIGxpbXBpYXIgcGFsYWJyYXMgY29tdW5lcwpwYWxhYnJhcyA8LSByZW1vdmVXb3JkcygKICBwYWxhYnJhcywKICBjKHN0b3B3b3JkcygiZW5nbGlzaCIpLAogICAgImZpbG0iLCJtb3ZpZSIsIm9uZSIsImxpa2UiLCJqdXN0IiwicmVhbGx5IiwKICAgICJjYW4iLCJ3aWxsIiwiZ2V0IiwibWFrZSIsInNlZSIsImV2ZW4iLCJhbHNvIikKKQoKc2V0LnNlZWQoMTIzKQoKd29yZGNsb3VkKAogIHdvcmRzID0gcGFsYWJyYXMsCiAgbWF4LndvcmRzID0gNzAsICAgICAKICBtaW4uZnJlcSA9IDYsICAgICAgIAogIHJhbmRvbS5vcmRlciA9IEZBTFNFLAogIHJvdC5wZXIgPSAwLjEsCiAgY29sb3JzID0gYnJld2VyLnBhbCg4LCAiRGFyazIiKQopCmBgYAoKIyBbKipJbmNsdWlyIFBORy9HSUYgZGUgbGFzIHBlbMOtY3VsYXMgbWVuY2lvbmFkYXMqKl17c3R5bGU9ImNvbG9yOiAjZmYwMDdmIn0KCmBgYHtyLCBvdXQud2lkdGg9IjI1MHB4In0Ka25pdHI6OmluY2x1ZGVfZ3JhcGhpY3MoYygKICAiaW1nL2ltYWdlczEuanBnIiwKICAiaW1nL2ltYWdlczIuanBnIiwKICAiaW1nL2ltYWdlczMuanBnIiwKICAiaW1nL2ltYWdlczQuanBnIiwKICAiaW1nL2ltYWdlczUuanBnIgopKQpgYGAKCmBgYHtyfQpsaXN0LmZpbGVzKCJpbWciKQpgYGAKCiMgWyoqQ29uY2x1c2lvbmVzKipde3N0eWxlPSJjb2xvcjogI2ZmMDA3ZjsifQoKQ29uIGxhIGJhc2UgbW92aWVfcmV2aWV3cy5jc3Ygc2UgaWRlbnRpZmljYXJvbiBwZWzDrWN1bGFzIGNvbiBtZWpvciB5CnBlb3IgcGVyY2VwY2nDs24gdXNhbmRvIHNlbnRpbWllbnRvIHByb21lZGlvLCBhZGVtw6FzIGRlIHVuIHJlc3VtZW4gZGUKZW1vY2lvbmVzIChOUkMpIHkgdW5hIG51YmUgZGUgcGFsYWJyYXMgY29uIGxvcyB0w6lybWlub3MgbcOhcyBmcmVjdWVudGVzLgo=