library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(syuzhet)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(knitr)
# Load the raw review data set; text columns stay as character vectors.
movies <- read.csv(
  file = "~/Desktop/movie_reviews.csv",
  stringsAsFactors = FALSE
)

# Preview the first three rows, leaving out the long free-text review column.
head(select(movies, -review_text), 3)
## X review_id movie_name year reviewer_name rated year_api
## 1 0 1 Lethal Weapon 3 1992 J. Boyajian R 1992
## 2 1 2 Lethal Weapon 3 1999 Frank Maloney R 1992
## 3 2 3 Lethal Weapon 3 1995 Brian L. R 1992
## genre directors
## 1 Action, Crime, Thriller Richard Donner
## 2 Action, Crime, Thriller Richard Donner
## 3 Action, Crime, Thriller Richard Donner
## writers
## 1 Jeffrey Boam, Robert Mark Kamen, Shane Black
## 2 Jeffrey Boam, Robert Mark Kamen, Shane Black
## 3 Jeffrey Boam, Robert Mark Kamen, Shane Black
## actors
## 1 Mel Gibson, Danny Glover, Joe Pesci
## 2 Mel Gibson, Danny Glover, Joe Pesci
## 3 Mel Gibson, Danny Glover, Joe Pesci
## plot
## 1 Martin Riggs and Roger Murtaugh pursue a former LAPD officer who uses his knowledge of police procedure and policies to steal and sell confiscated guns and ammunition to local street gangs.
## 2 Martin Riggs and Roger Murtaugh pursue a former LAPD officer who uses his knowledge of police procedure and policies to steal and sell confiscated guns and ammunition to local street gangs.
## 3 Martin Riggs and Roger Murtaugh pursue a former LAPD officer who uses his knowledge of police procedure and policies to steal and sell confiscated guns and ammunition to local street gangs.
## first_genre first_actor first_director first_writer first_actor_gender
## 1 Action Mel Gibson Richard Donner Jeffrey Boam female
## 2 Action Mel Gibson Richard Donner Jeffrey Boam female
## 3 Action Mel Gibson Richard Donner Jeffrey Boam female
## first_director_gender first_writer_gender
## 1 male male
## 2 male male
## 3 male male
# Normalise the review text before scoring:
#   1. drop rows whose review is missing or empty,
#   2. lower-case the text,
#   3. replace every character that is not a-z, whitespace or an apostrophe
#      with a space,
#   4. collapse runs of whitespace and trim both ends.
movies2 <- movies %>%
  filter(!is.na(review_text), review_text != "") %>%
  mutate(
    review_text = tolower(review_text),
    # Use the POSIX class [:space:] here: shorthand escapes such as "\\s" are
    # not reliably interpreted inside a bracket expression by R's default
    # regex engine (perl = FALSE), which let literal backslashes survive.
    review_text = gsub("[^a-z[:space:]']", " ", review_text),
    review_text = gsub("\\s+", " ", review_text),
    review_text = trimws(review_text)
  )

# Number of reviews that remain after filtering.
nrow(movies2)
## [1] 18862
# Score every cleaned review with the AFINN lexicon.
# NOTE(review): the score is computed per whole review, so longer reviews can
# reach larger absolute values -- confirm that no length normalisation is
# wanted before comparing movies.
movies2 <- movies2 %>%
  mutate(sent_score = get_sentiment(review_text, method = "afinn"))

# Per-movie summary (review count, mean and median score), restricted to
# movies with at least 5 reviews and sorted from most to least positive.
# NOTE(review): grouping uses the title only, so different films that share
# a title would be pooled together -- verify against the data.
pelis_rank <- movies2 %>%
  group_by(movie_name) %>%
  summarise(
    n_reviews     = n(),
    sent_promedio = mean(sent_score),
    sent_mediana  = median(sent_score),
    .groups       = "drop"
  ) %>%
  filter(n_reviews >= 5) %>%
  arrange(desc(sent_promedio))

# Five best movies (the table is already sorted in descending order) and the
# five worst (re-sorted in ascending order first).
top5 <- head(pelis_rank, 5)
bottom5 <- head(arrange(pelis_rank, sent_promedio), 5)

top5
## # A tibble: 5 × 4
## movie_name n_reviews sent_promedio sent_mediana
## <chr> <int> <dbl> <dbl>
## 1 Beauty and the Beast 7 46 41
## 2 Almost Famous 23 44.9 45
## 3 Tous les matins du monde 5 44 45
## 4 Billy's Hollywood Screen Kiss 5 42.6 37
## 5 Wild Man Blues 7 42.3 36
# Show the five movies with the lowest mean AFINN score (among movies with at
# least 5 reviews).
bottom5
## # A tibble: 5 × 4
## movie_name n_reviews sent_promedio sent_mediana
## <chr> <int> <dbl> <dbl>
## 1 Paradise Lost: The Child Murders at Robi… 6 -37.3 -44.5
## 2 Bandit Queen 6 -34.8 -26
## 3 M 6 -27.7 -26
## 4 Jason Goes to Hell: The Final Friday 5 -23.8 -20
## 5 Bully 6 -23.3 -27.5
# Ten highest- and ten lowest-ranked movies, each tagged with its facet label.
top10 <- head(pelis_rank, 10) %>%
  mutate(tipo = "Top 10")
bot10 <- pelis_rank %>%
  arrange(sent_promedio) %>%
  head(10) %>%
  mutate(tipo = "Bottom 10")

# Stack both groups and order the titles by mean sentiment so the bars are
# drawn in sorted order.
rank_plot <- bind_rows(top10, bot10)
rank_plot$movie_name <- reorder(rank_plot$movie_name, rank_plot$sent_promedio)

# Horizontal bar chart, one panel per group.
ggplot(data = rank_plot, mapping = aes(x = movie_name, y = sent_promedio)) +
  geom_col() +
  coord_flip() +
  facet_wrap(vars(tipo), scales = "free_y") +
  ggtitle("Mejores y peores películas por sentimiento (AFINN)") +
  xlab("Película") +
  ylab("Sentimiento promedio")
# <span style="color:red">Análisis de Emociones y Sentimientos</span>
library(dplyr)
library(syuzhet)
# Fix the RNG so the sampled reviews are reproducible.
set.seed(1)

# 1) Use at most 200 randomly chosen reviews to keep this step fast
#    (change 200 if desired).
muestra <- slice_sample(movies2, n = min(nrow(movies2), 200))

# 2) Join the sampled reviews into one string and keep only its first
#    50000 characters to bound processing time (change 50000 if desired,
#    e.g. 30k-80k). Anything beyond that cut-off is not analysed.
texto <- substr(paste(muestra$review_text, collapse = " "), 1, 50000)

# 3) Tokenise and look up NRC emotions for every token.
tokens <- get_tokens(texto)
emociones <- get_nrc_sentiment(tokens, language = "english")

# Bar chart of the column totals of the first eight columns (the emotion
# counts; the positive/negative columns are used separately below).
par(mar = c(6, 4, 2, 1))
barplot(
  colSums(emociones[, seq_len(8)]),
  main = "Distribución de emociones (NRC)",
  las = 2
)

# Net valence per token: positive minus negative.
sentimientos <- emociones$positive + (emociones$negative * -1)
simple_plot(sentimientos)
library(wordcloud)
library(RColorBrewer)
library(tm)
# Reuse the tokens that were already computed for the NRC analysis.
palabras <- tokens

# Drop common words: the standard English stop words plus review-specific
# filler terms. The tokens are filtered directly instead of calling
# removeWords(), which is meant for documents and only blanks matches out,
# leaving empty strings behind.
palabras_excluir <- c(
  stopwords("english"),
  "film", "movie", "one", "like", "just", "really",
  "can", "will", "get", "make", "see", "even", "also"
)
# Keep words of at least 3 characters, mirroring the minimum word length that
# tm applied when wordcloud() built the term matrix itself.
palabras <- palabras[!palabras %in% palabras_excluir & nchar(palabras) >= 3]

# Count each word once and hand the frequencies to wordcloud() explicitly.
# Passing raw tokens without `freq` makes wordcloud() build a corpus with one
# document per token, which is slow and emits "transformation drops
# documents" warnings.
frecuencias <- table(palabras)

set.seed(123)
wordcloud(
  words = names(frecuencias),
  freq = as.numeric(frecuencias),
  max.words = 70,
  min.freq = 6,
  random.order = FALSE,
  rot.per = 0.1,
  colors = brewer.pal(8, "Dark2")
)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
# Embed the five illustrative images stored in the img/ folder.
knitr::include_graphics(
  path = c(
    "img/images1.jpg", "img/images2.jpg", "img/images3.jpg",
    "img/images4.jpg", "img/images5.jpg"
  )
)

# List the contents of img/ to confirm which image files are present.
list.files(path = "img")
## [1] "images1.jpg" "images2.jpg" "images3.jpg" "images4.jpg" "images5.jpg"
Con la base `movie_reviews.csv` se identificaron las películas con mejor y peor percepción según el sentimiento promedio (AFINN) de sus reseñas; además, se obtuvo un resumen de emociones (NRC) y una nube de palabras con los términos más frecuentes.