library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tm)
## Cargando paquete requerido: NLP
## 
## Adjuntando el paquete: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(wordcloud)
## Cargando paquete requerido: RColorBrewer
library(RColorBrewer)

Cargar base de datos

# Read the movie-review CSV from disk.
# NOTE(review): the path is absolute and machine-specific; confirm whether a
# project-relative location should be used instead.
movies <- readr::read_csv(
  file = "C:/Users/ramir/Downloads/movie_reviews.csv"
)
## New names:
## Rows: 18862 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): movie_name, reviewer_name, review_text, rated, genre, directors, w... dbl
## (4): ...1, review_id, year, year_api
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Keep only rows whose API year is 1992 and whose review text is present.
movies_1992 <- dplyr::filter(
  movies,
  year_api == 1992,
  !is.na(review_text)
)

# Number of reviews retained.
nrow(movies_1992)
## [1] 343

Limpiar texto

# Build a corpus with one document per 1992 review text.
corpus <- VCorpus(VectorSource(movies_1992[["review_text"]]))

# Normalise the text step by step: lower-case, strip punctuation and digits,
# remove English stop words, then collapse repeated whitespace.
# NOTE(review): punctuation is stripped before stop words are removed, so
# apostrophe forms (e.g. "don't" -> "dont") may no longer match the stop-word
# list; confirm whether the two steps should be swapped.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)

Matriz documento-término

# Build the document-term matrix from the cleaned corpus.
dtm <- DocumentTermMatrix(corpus)

# Dense copy so base matrix operations (colSums, kmeans) can be applied.
dtm_matrix <- as.matrix(dtm)

# Drop infrequent terms: keep columns whose total count exceeds 5.
# drop = FALSE keeps the result a matrix even when only one term passes.
dtm_matrix <- dtm_matrix[, colSums(dtm_matrix) > 5, drop = FALSE]

dim(dtm_matrix)
## [1]  343 3659

K-MEANS

# Fix the RNG state so the random starts of k-means are repeatable.
set.seed(123)

# Number of clusters; change this value to try other partitions.
k <- 3

# Fit k-means on the term-count matrix using 25 random initialisations.
kmeans_model <- kmeans(x = dtm_matrix, centers = k, nstart = 25)

# Store each review's cluster label as a factor column.
movies_1992[["cluster"]] <- as.factor(kmeans_model[["cluster"]])

# Cluster sizes.
table(movies_1992[["cluster"]])
## 
##   1   2   3 
## 249  11  83

Visualización de clusters

# dtm_matrix already holds the dense document-term counts restricted to terms
# with a total count above 5 (built earlier in this script from the same
# corpus), so it is reused here rather than rebuilt.

# Drop zero-variance columns (terms with the same count in every review).
# drop = FALSE keeps the result a matrix even if a single column remains.
# NOTE(review): no plot is produced in the visible code of this section;
# presumably a cluster visualisation (factoextra is loaded) was meant to
# follow. Confirm.
dtm_matrix <- dtm_matrix[, apply(dtm_matrix, 2, var) != 0, drop = FALSE]

dim(dtm_matrix)
## [1]  343 3648

Palabras del cluster

# Term labels of the filtered document-term matrix.
terms <- colnames(dtm_matrix)

# For each cluster, print its 10 most frequent terms with their total counts.
# seq_len() yields an empty sequence (no iterations) if k were 0.
for (i in seq_len(k)) {
  cat("Cluster", i, "\n")

  # Rows belonging to cluster i. drop = FALSE keeps a matrix even when the
  # cluster holds a single review, which colSums() requires.
  cluster_docs <- dtm_matrix[movies_1992$cluster == i, , drop = FALSE]

  # head() returns at most 10 entries and does not pad with NA when fewer
  # than 10 terms exist.
  top_words <- head(sort(colSums(cluster_docs), decreasing = TRUE), 10)

  print(top_words)
  cat("\n\n")
}
## Cluster 1 
##               review                movie                 film 
##                 1034                  960                  786 
##              reviews                  one               author 
##                  782                  574                  505 
##       responsibility recartsmoviesreviews                 like 
##                  502                  498                  339 
##                 good 
##                  295 
## 
## 
## Cluster 2 
##    film     one   films  horror phantom    even    like  review    just  really 
##     269      95      80      73      73      68      62      61      60      53 
## 
## 
## Cluster 3 
##                 film               review              reviews 
##                  948                  353                  262 
##                  one                movie                 like 
##                  257                  213                  173 
##                films               author recartsmoviesreviews 
##                  171                  169                  167 
##       responsibility 
##                  166

Nube de palabras

# Word cloud of overall term frequencies across all 1992 reviews, plotting at
# most 100 words and colouring them with the 8-colour "Dark2" Brewer palette.
wordcloud(
  words = colnames(dtm_matrix),
  freq = colSums(dtm_matrix),
  max.words = 100,
  colors = brewer.pal(n = 8, name = "Dark2")
)

Conclusión

El clustering permitió agrupar las reseñas de 1992 según similitudes en su contenido, identificando patrones en las opiniones sin usar una variable de respuesta (etiquetas) definida previamente. Esto demuestra que el aprendizaje no supervisado es útil para analizar texto y detectar tendencias.