Movies

$Comienzo de las películas de Fox - 20th Century Fox Intro$

Comienzo de las películas de Fox - 20th Century Fox Intro

Cargar librerías

library(pdftools)

## Using poppler version 25.10.0

library(magick)

## Linking to ImageMagick 6.9.13.29
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11

library(tesseract)
library(purrr)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

library(syuzhet)
library(tm)

## Loading required package: NLP

library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)

Preparar documento

# Path to the PDF export of the movie-review table.
pdf_path <- "C:/Users/joseo/Downloads/movie_reviews.csv.pdf"

# Fail early with a readable message instead of a low-level reader error.
if (!file.exists(pdf_path)) {
  stop("No se encontró el PDF: ", pdf_path, call. = FALSE)
}

# 1) First attempt: read the embedded text layer and join the pages with
#    newlines into a single string.
pages_txt <- pdftools::pdf_text(pdf_path)
texto_pdf <- paste(pages_txt, collapse = "\n")

# 2) Fallback: fewer than 200 characters of squished text is treated as
#    "no usable text layer", so the pages are rendered to images and OCR'd.
if (nchar(str_squish(texto_pdf)) < 200) {
  message("El PDF parece escaneado o sin texto seleccionable. Aplicando OCR...")

  # Render every page at 400 dpi; pdf_convert() returns the image file paths.
  imgs <- pdftools::pdf_convert(pdf_path, dpi = 400)

  # Build the OCR engine once instead of once per page.
  motor_ocr <- tesseract("eng")

  texto_pdf <- imgs %>%
    map_chr(~ ocr(image_read(.x), engine = motor_ocr)) %>%
    paste(collapse = "\n")

  # The rendered pages are only needed for OCR; do not leave them on disk.
  unlink(imgs)
}

# Quick look at the first 1500 characters.
cat(substr(texto_pdf, 1, 1500))

##      review_id movie_name          year    reviewer_name        review_text                                                                                                                                                                                                                                                                                                                                  rated         year_api genre                                                            directors                                                                                        writers                                                                                                                                                                                                                                                                                                                                                    actors                                                                                                               plot                                                                                                                                                                                                                                                                  first_genre   first_actor       first_director      first_writer      first_actor_gender   first_director_gender   first_writer_gender
##  0           1 Lethal Weapon 3      1992.0 J. Boyajian

# The PDF is a table-style export, so the raw text is normalised step by step:
# soft hyphens (U+00AD) are dropped, carriage returns and tabs become spaces,
# whitespace runs collapse to one space, and the ends are trimmed.
patron_encabezado <- "review_id\\s+movie_name\\s+year\\s+reviewer_name\\s+review_text\\s+rated.*?first_writer_gender"

texto_normalizado <- str_replace_all(texto_pdf, "\u00ad", "")
texto_normalizado <- str_replace_all(texto_normalizado, "[\r\t]", " ")
texto_normalizado <- str_replace_all(texto_normalizado, "\\s+", " ")
texto_normalizado <- str_squish(texto_normalizado)

# Replace each repeated column-header row with a single space.
texto_limpio <- str_replace_all(texto_normalizado, patron_encabezado, " ")

# Preview the first 1500 characters of the cleaned text.
cat(substr(texto_limpio, 1, 1500))

##   0 1 Lethal Weapon 3 1992.0 J. Boyajian About 20 minutes into LETHAL WEAPON 3, my nephew turned to me andasked, "Does this movie have a plot?" And that question representseverything that is wrong with LW3. Quite frankly, the movie is a messon a number of levels. A *funny* mess, to be sure, but still a mess.As the Bard of Avon would put it, it was fullRof sound and fury,signifying 1992 Action, nothing. Crime, Thriller It was just about everything a bad sequel Richard usuallyis. Donner o Joe Pesci's appearance in the film smacked of dollarsigns. Not Jeffreyjust Boam,because Roberthe'sMark at the moment, "hot"Kamen, Shane Black but because it seemed to exploit his contribution to LW2. His character and performance in LW2 was what made the film his presence as "the Third Stooge" made it distinctive enough from the original. To put him in the third film seemed to be nothing but repetition Melof Gibson, a successful Danny Glover, ingredient Joe Pesciin place of doing something else more MartinStorywise, innovative. Riggs andthere Rogerwas Murtaugh absolutely pursue no reason a former forLAPD him officer to bewhoin theuses o The film.his knowledge character of police of Murtaugh procedureseemed and policies to be to steal and sell confiscated guns and ammunition to local street gangs. Action Mel Gibson Richard Donner Jeffrey Boam female male male 1 2 Lethal Weapon 3 1999.0 Frank Maloney LETHAL WEAPON 3 is a film directed by Richard Donner, written byJeffrey Boam. It stars Mel Gibso

Tokenizar

# 1) Tokenise the cleaned text and lower-case every token.
tokens <- tolower(get_tokens(texto_limpio))

# 2) Aggressive clean-up to shrink the vector: keep only non-empty tokens that
#    are not English stopwords and are longer than two characters.
stopwords_en <- stopwords("en")
conservar <- tokens != "" & !(tokens %in% stopwords_en) & nchar(tokens) > 2
tokens <- tokens[conservar]

# 3) Cap the workload: above max_tokens, take a seeded random sample.
set.seed(123)
max_tokens <- 30000
tokens_sample <- if (length(tokens) > max_tokens) {
  sample(tokens, max_tokens)
} else {
  tokens
}

# 4) NRC emotion scores for the sampled tokens.
emociones <- get_nrc_sentiment(tokens_sample)

# 5) Sanity check on the shape and the first rows.
dim(emociones)

## [1] 30000    10

head(emociones)

##   anger anticipation disgust fear joy sadness surprise trust negative positive
## 1     0            0       0    0   0       0        0     0        0        0
## 2     0            0       0    0   0       0        0     0        0        0
## 3     0            0       0    0   0       0        0     0        0        0
## 4     0            0       0    0   0       0        0     0        0        0
## 5     0            0       0    0   0       1        0     0        1        0
## 6     0            1       0    1   0       0        0     0        1        0

Resumen final de los movie reviews

## 1) Numeric summary
# Column totals for the first eight columns (the emotions), largest first.
totales <- colSums(emociones[, 1:8])
resumen_emociones <- sort(totales, decreasing = TRUE)
resumen_emociones

##        trust anticipation         fear          joy      sadness        anger 
##         1331         1091          996          880          751          749 
##      disgust     surprise 
##          561          561

# Dominant emotion: the name at the head of the sorted totals.
emocion_top <- head(names(resumen_emociones), 1)
emocion_top

## [1] "trust"

Conclusión

## 2) Conclusion

# Print the closing paragraph with the dominant emotion wrapped in `**`.
# The fragments are joined with sep = "", so any space between them has to be
# part of the strings themselves; the fragment ending in "emoción," previously
# lacked one, which glued it to "mientras".
cat(
  "Conclusión: En la muestra analizada, la emoción predominante es **",
  emocion_top,
  "**. Esto sugiere que el lenguaje de las reseñas tiende a estar más asociado a esa emoción, ",
  "mientras que el resto de emociones aparecen en menor proporción. \n",
  sep = ""
)

## Conclusión: En la muestra analizada, la emoción predominante es **trust**. Esto sugiere que el lenguaje de las reseñas tiende a estar más asociado a esa emoción,mientras que el resto de emociones aparecen en menor proporción.

WordCloud

# Start from the same cleaned token sample used for the NRC analysis.
palabras <- tokens_sample

# Extra filtering for the cloud: drop English stopwords and words of three
# characters or fewer.
palabras <- palabras[!palabras %in% stopwords("en")]
palabras <- palabras[nchar(palabras) > 3]

# Generic review vocabulary that would otherwise dominate the cloud.
palabras_ruido <- c(
  "film", "movie", "one", "like", "will", "just", "can", "make", "time",
  "character", "characters", "story", "even", "really", "much", "well"
)

# Each element is a single token, so whole-token matching drops it outright.
# removeWords() would instead blank the match to "" and keep the empty string
# in the vector.
palabras <- palabras[!palabras %in% palabras_ruido]

# Word cloud
set.seed(123)

if (length(palabras) == 0) {
  message("No quedan palabras para dibujar la nube.")
} else {
  # Count occurrences up front and pass them as `freq`. Given raw text only,
  # wordcloud() builds a tm corpus internally, which is what raised the
  # "transformation drops documents" warnings.
  frecuencias <- table(palabras)

  wordcloud(
    words = names(frecuencias),
    freq = as.integer(frecuencias),
    min.freq = 10,
    max.words = 100,
    random.order = FALSE,
    rot.per = 0.2,
    colors = brewer.pal(8, "Dark2")
  )
}

## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

Análisis del WordCloud

# Movie whose surrounding text is analysed on its own.
pelicula <- "American Pie"

# Characters of context kept on each side of every match.
ventana <- 800

# Locate every literal occurrence of the title in the raw PDF text.
matches <- str_locate_all(texto_pdf, fixed(pelicula))[[1]]

if (nrow(matches) == 0) {
  message("No encontré esa película en el PDF. Revisa el nombre exacto.")
} else {
  # Context window around each match, clamped to the bounds of the text.
  ini <- pmax(1, matches[, "start"] - ventana)
  fin <- pmin(nchar(texto_pdf), matches[, "end"] + ventana)

  # Merge overlapping windows so text shared by nearby matches is extracted
  # (and therefore counted) only once. str_locate_all() reports matches left
  # to right, so a new segment starts whenever a window begins after the
  # furthest end seen so far.
  n_ventanas <- length(ini)
  nuevo_tramo <- c(TRUE, ini[-1] > cummax(fin)[-n_ventanas])
  tramo <- cumsum(nuevo_tramo)
  ini_tramo <- as.vector(tapply(ini, tramo, min))
  fin_tramo <- as.vector(tapply(fin, tramo, max))

  ctx <- str_sub(texto_pdf, ini_tramo, fin_tramo)

  # Same token clean-up as the global analysis: lower-case, then drop empty
  # tokens, English stopwords and tokens of two characters or fewer.
  texto_pelicula <- str_squish(paste(ctx, collapse = "\n"))
  tokens_peli <- tolower(get_tokens(texto_pelicula))
  tokens_peli <- tokens_peli[tokens_peli != ""]
  tokens_peli <- tokens_peli[!tokens_peli %in% stopwords("en")]
  tokens_peli <- tokens_peli[nchar(tokens_peli) > 2]

  if (length(tokens_peli) == 0) {
    message("No quedaron tokens para analizar tras la limpieza.")
  } else {
    # Cap the workload with a seeded sample.
    set.seed(1)
    if (length(tokens_peli) > 20000) {
      tokens_peli <- sample(tokens_peli, 20000)
    }

    emo_peli <- get_nrc_sentiment(tokens_peli)

    # Share of the total emotion score taken by each of the first eight
    # columns.
    barplot(
      colSums(prop.table(emo_peli[, 1:8])),
      main = paste("Emociones (NRC) -", pelicula),
      las = 2
    )

    # Net valence per token: positive minus negative.
    sent_peli <- emo_peli$positive - emo_peli$negative
    simple_plot(sent_peli)
  }
}

Movies

Equipo 5

2026-02-23

Cargar librerías

Preparar documento

Tokenizar

Resumen final de los movie reviews

Conclusión

WordCloud

Análisis del WordCloud