Comienzo de las películas de Fox - 20th Century
Fox Intro
Cargar librerías
library(pdftools)
## Using poppler version 25.10.0
library(magick)
## Linking to ImageMagick 6.9.13.29
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
library(tesseract)
library(purrr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(syuzhet)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
Preparar documento
# Path to the PDF. This is a machine-specific absolute path; adjust it when
# running the script elsewhere.
pdf_path <- "C:/Users/joseo/Downloads/movie_reviews.csv.pdf"

# 1) First attempt: read the embedded text layer and join the pages.
pages_txt <- pdftools::pdf_text(pdf_path)
texto_pdf <- paste(pages_txt, collapse = "\n")

# 2) Fallback: when fewer than 200 non-blank characters were extracted, treat
#    the PDF as scanned and recover the text with OCR instead.
if (nchar(str_squish(texto_pdf)) < 200) {
  message("El PDF parece escaneado o sin texto seleccionable. Aplicando OCR...")

  # Render the PDF pages to image files; `imgs` holds the written file names.
  imgs <- pdftools::pdf_convert(pdf_path, dpi = 400)

  # Build the OCR engine once and reuse it for every page, instead of
  # constructing a new engine inside the per-page call.
  motor_ocr <- tesseract("eng")

  texto_pdf <- imgs %>%
    map_chr(~ ocr(image_read(.x), engine = motor_ocr)) %>%
    paste(collapse = "\n")

  # The rendered page images are only an intermediate artefact of the OCR
  # step; remove them so they do not pile up on disk.
  unlink(imgs)
}

# Quick look at the first 1500 characters of the extracted text.
cat(substr(texto_pdf, 1, 1500))
## review_id movie_name year reviewer_name review_text rated year_api genre directors writers actors plot first_genre first_actor first_director first_writer first_actor_gender first_director_gender first_writer_gender
## 0 1 Lethal Weapon 3 1992.0 J. Boyajian
# The PDF is a table-style export, so the raw text is normalised step by step:
# drop soft hyphens, turn carriage returns / tabs into spaces, collapse every
# whitespace run to one space, and trim the ends.
texto_limpio <- str_replace_all(texto_pdf, "\u00ad", "")
texto_limpio <- str_replace_all(texto_limpio, "[\r\t]", " ")
texto_limpio <- str_replace_all(texto_limpio, "\\s+", " ")
texto_limpio <- str_squish(texto_limpio)

# Remove every repeated column-header row (from "review_id" through
# "first_writer_gender"), replacing each occurrence with a single space.
patron_encabezado <-
  "review_id\\s+movie_name\\s+year\\s+reviewer_name\\s+review_text\\s+rated.*?first_writer_gender"
texto_limpio <- str_replace_all(texto_limpio, patron_encabezado, " ")

# Quick look at the first 1500 characters of the cleaned text.
cat(substr(texto_limpio, 1, 1500))
## 0 1 Lethal Weapon 3 1992.0 J. Boyajian About 20 minutes into LETHAL WEAPON 3, my nephew turned to me andasked, "Does this movie have a plot?" And that question representseverything that is wrong with LW3. Quite frankly, the movie is a messon a number of levels. A *funny* mess, to be sure, but still a mess.As the Bard of Avon would put it, it was fullRof sound and fury,signifying 1992 Action, nothing. Crime, Thriller It was just about everything a bad sequel Richard usuallyis. Donner o Joe Pesci's appearance in the film smacked of dollarsigns. Not Jeffreyjust Boam,because Roberthe'sMark at the moment, "hot"Kamen, Shane Black but because it seemed to exploit his contribution to LW2. His character and performance in LW2 was what made the film his presence as "the Third Stooge" made it distinctive enough from the original. To put him in the third film seemed to be nothing but repetition Melof Gibson, a successful Danny Glover, ingredient Joe Pesciin place of doing something else more MartinStorywise, innovative. Riggs andthere Rogerwas Murtaugh absolutely pursue no reason a former forLAPD him officer to bewhoin theuses o The film.his knowledge character of police of Murtaugh procedureseemed and policies to be to steal and sell confiscated guns and ammunition to local street gangs. Action Mel Gibson Richard Donner Jeffrey Boam female male male 1 2 Lethal Weapon 3 1999.0 Frank Maloney LETHAL WEAPON 3 is a film directed by Richard Donner, written byJeffrey Boam. It stars Mel Gibso
Tokenizar
# 1) Tokenise the cleaned text and lower-case every token.
tokens <- tolower(get_tokens(texto_limpio))

# 2) Aggressive filtering to shrink the vector before scoring: keep a token
#    only if it is non-empty, is not an English stopword, and is longer than
#    two characters (shorter tokens are mostly noise).
es_valido <- tokens != "" &
  !(tokens %in% stopwords("en")) &
  nchar(tokens) > 2
tokens <- tokens[es_valido]

# 3) Cap the workload: score at most `max_tokens` tokens, drawn at random with
#    a fixed seed so the run is reproducible.
set.seed(123)
max_tokens <- 30000
tokens_sample <- if (length(tokens) > max_tokens) {
  sample(tokens, max_tokens)
} else {
  tokens
}

# 4) NRC emotion scores, one row per token.
emociones <- get_nrc_sentiment(tokens_sample)

# 5) Sanity check on the shape and the first rows of the result.
dim(emociones)
## [1] 30000 10
head(emociones)
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 1 0 0 1 0
## 6 0 1 0 1 0 0 0 0 1 0
Resumen final de las reseñas de películas
## 1) Numeric summary
# Total count per emotion over the first eight columns of `emociones`
# (the emotion columns; `negative` / `positive` are excluded), largest first.
resumen_emociones <- emociones[, 1:8] %>%
  colSums() %>%
  sort(decreasing = TRUE)
resumen_emociones
## trust anticipation fear joy sadness anger
## 1331 1091 996 880 751 749
## disgust surprise
## 561 561
# Dominant emotion: the name of the first entry of the sorted totals.
emocion_top <- names(resumen_emociones)[1L]
emocion_top
## [1] "trust"
Conclusión
## 2) Conclusion
# Print a one-sentence conclusion naming the dominant emotion.
# The pieces are joined with sep = "", so every separating space has to live
# inside the string literals themselves; the literal ending in "emoción, "
# carries the space that was previously missing before "mientras".
cat(
  "Conclusión: En la muestra analizada, la emoción predominante es **",
  emocion_top,
  "**. Esto sugiere que el lenguaje de las reseñas tiende a estar más asociado a esa emoción, ",
  "mientras que el resto de emociones aparecen en menor proporción. \n",
  sep = ""
)
## Conclusión: En la muestra analizada, la emoción predominante es **trust**. Esto sugiere que el lenguaje de las reseñas tiende a estar más asociado a esa emoción, mientras que el resto de emociones aparecen en menor proporción.
WordCloud
# Reuse the tokens that were already cleaned (the sampled vector).
palabras <- tokens_sample

# Extra cleaning: drop English stopwords and words of three characters or less.
palabras <- palabras[!palabras %in% stopwords("en")]
palabras <- palabras[nchar(palabras) > 3]

# Drop filler words that are typical of this dataset. Filtering with %in%
# removes the tokens outright; removeWords() on a token vector would only
# blank them to "" and leave empty entries behind.
palabras_ruido <- c(
  "film", "movie", "one", "like", "will", "just", "can", "make", "time",
  "character", "characters", "story", "even", "really", "much", "well"
)
palabras <- palabras[!palabras %in% palabras_ruido]

# Count each word once and hand the frequencies to wordcloud() explicitly.
# When `freq` is omitted, wordcloud() builds a tm corpus with one document per
# token, which is slow and triggers the "transformation drops documents"
# warnings from tm_map().
frecuencias <- sort(table(palabras), decreasing = TRUE)

# Word cloud (fixed seed so the layout is reproducible).
set.seed(123)
wordcloud(
  words = names(frecuencias),
  freq = as.numeric(frecuencias),
  min.freq = 10,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.2,
  colors = brewer.pal(8, "Dark2")
)

Análisis del WordCloud
# Movie whose surrounding text is analysed.
pelicula <- "American Pie"

# Locate every literal occurrence of the movie name in the raw PDF text.
matches <- str_locate_all(texto_pdf, fixed(pelicula))[[1]]

if (nrow(matches) == 0) {
  message("No encontré esa película en el PDF. Revisa el nombre exacto.")
} else {
  # Context window: 800 characters on each side of every occurrence, clamped
  # to the bounds of the text.
  ini <- pmax(1L, matches[, "start"] - 800L)
  fin <- pmin(nchar(texto_pdf), matches[, "end"] + 800L)

  # Occurrences that are close together produce overlapping windows; merge
  # them so no stretch of text is extracted (and scored) more than once.
  # A new group starts whenever a window begins after every earlier one ends.
  fin_previo <- c(-Inf, head(cummax(fin), -1))
  grupo <- cumsum(ini > fin_previo)
  ctx <- str_sub(
    texto_pdf,
    as.vector(tapply(ini, grupo, min)),
    as.vector(tapply(fin, grupo, max))
  )

  texto_pelicula <- str_squish(paste(ctx, collapse = "\n"))

  # Same token clean-up as the global analysis: lower-case, then drop empty
  # tokens, English stopwords and tokens of two characters or less.
  tokens_peli <- get_tokens(texto_pelicula)
  tokens_peli <- tolower(tokens_peli)
  tokens_peli <- tokens_peli[tokens_peli != ""]
  tokens_peli <- tokens_peli[!tokens_peli %in% stopwords("en")]
  tokens_peli <- tokens_peli[nchar(tokens_peli) > 2]

  # Cap at 20000 tokens. The sampled positions are sorted so the tokens keep
  # their original order, which the sentiment-trajectory plot below relies on.
  set.seed(1)
  if (length(tokens_peli) > 20000) {
    idx <- sort(sample.int(length(tokens_peli), 20000))
    tokens_peli <- tokens_peli[idx]
  }

  emo_peli <- get_nrc_sentiment(tokens_peli)

  # Share of each of the eight emotions among all emotion hits.
  barplot(
    colSums(prop.table(emo_peli[, 1:8])),
    main = paste("Emociones (NRC) -", pelicula),
    las = 2
  )

  # Net sentiment per token: positive minus negative, plotted along the text.
  sent_peli <- (emo_peli$negative * -1) + emo_peli$positive
  simple_plot(sent_peli)
}

