library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

Cargar base de datos

# Read the movie-review dataset from a local CSV file.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists.
movies <- read_csv(file = "/Users/edu_sssedu/Desktop/Concentración/movie_reviews.csv")
## New names:
## Rows: 18862 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): movie_name, reviewer_name, review_text, rated, genre, directors, w... dbl
## (4): ...1, review_id, year, year_api
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Keep only the reviews whose API year is 2001 and whose text is not missing
movies_2001 <- filter(movies, year_api == 2001, !is.na(review_text))

# Number of reviews that remain after filtering
nrow(movies_2001)
## [1] 854

Limpiar texto

# Build a corpus with one document per 2001 review text.
corpus <- VCorpus(VectorSource(movies_2001$review_text))

# Normalise the text: lower-case, drop English stop words, punctuation and
# digits, then collapse repeated whitespace.
# Stop words are removed BEFORE punctuation on purpose: the English stop-word
# list contains contractions with apostrophes ("don't", "isn't", ...). Stripping
# punctuation first would turn them into "dont", "isnt", ..., which no longer
# match the list and would survive as terms.
# NOTE(review): removePunctuation deletes punctuation without inserting a space,
# so dotted strings are glued into a single token; domain-specific boilerplate
# may need its own stop-word list — confirm against the raw reviews.
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)

Matriz documento-término

# Count term occurrences per document and convert the result to a dense matrix
dtm <- DocumentTermMatrix(corpus)
dtm_matrix <- as.matrix(dtm)

# Discard rare terms: keep only columns whose total count is greater than 5
dtm_matrix <- dtm_matrix[, which(colSums(dtm_matrix) > 5)]

# Dimensions of the filtered matrix
dim(dtm_matrix)
## [1]  854 8200

K-MEANS

# Fix the RNG seed so the random k-means initialisations are reproducible
set.seed(123)

# Number of clusters (change this value to try a different k)
k <- 3

# Run k-means on the term-count matrix, using 25 random starts.
# NOTE(review): the input is raw term counts, so document length may dominate
# the distances — consider normalising or TF-IDF weighting; confirm intent.
kmeans_model <- kmeans(x = dtm_matrix, centers = k, nstart = 25)

# Store each review's cluster label as a factor column
movies_2001[["cluster"]] <- factor(kmeans_model[["cluster"]])

# Number of reviews per cluster
table(movies_2001[["cluster"]])
## 
##   1   2   3 
##  22 135 697

Visualización de clusters

# dtm_matrix was already built from the corpus and restricted to terms with a
# total count above 5 earlier in the script, so it is reused here instead of
# rebuilding the document-term matrix and repeating the same filter.

# Drop zero-variance columns (terms with the same count in every document).
# drop = FALSE keeps the result a matrix even if only one column survives.
# NOTE(review): presumably this prepares the data for a standardised plot such
# as factoextra::fviz_cluster(kmeans_model, data = dtm_matrix), but no plot is
# actually produced in this section — confirm the intended visualisation.
dtm_matrix <- dtm_matrix[, apply(dtm_matrix, 2, var) != 0, drop = FALSE]

dim(dtm_matrix)
## [1]  854 8190

Palabras más frecuentes por cluster

terms <- colnames(dtm_matrix)

# Print the 10 most frequent terms of each cluster, ranked by the total count
# of the term over the documents assigned to that cluster.
# seq_len(k) yields an empty sequence when k is 0, unlike 1:k.
for (i in seq_len(k)) {
  cat("Cluster", i, "\n")

  # drop = FALSE keeps a matrix even when the cluster holds a single document;
  # otherwise the subset collapses to a vector and colSums() fails.
  cluster_docs <- dtm_matrix[movies_2001$cluster == i, , drop = FALSE]

  # head() returns fewer entries when there are fewer than 10 terms, whereas
  # indexing with [1:10] would pad the result with NA.
  top_words <- head(sort(colSums(cluster_docs), decreasing = TRUE), 10)

  print(top_words)
  cat("\n\n")
}
## Cluster 1 
##     david spielberg      film      love   kubrick    review      real     story 
##       191       151       136       117        98        91        80        80 
##   reviews       can 
##        79        77 
## 
## 
## Cluster 2 
##                 film                movie               review 
##                  847                  617                  581 
##                  one              reviews                 like 
##                  547                  490                  354 
##                 just               author       responsibility 
##                  314                  275                  273 
## recartsmoviesreviews 
##                  271 
## 
## 
## Cluster 3 
##               review              reviews                movie 
##                 2946                 2457                 2104 
##                 film               author       responsibility 
##                 1419                 1418                 1402 
## recartsmoviesreviews                  one                 like 
##                 1395                 1354                 1078 
##             original 
##                  911

Nube de palabras

# Draw a word cloud of at most 100 terms from the whole 2001 corpus, using each
# term's total count as its frequency and the 8-colour "Dark2" Brewer palette.
wordcloud(
  words = colnames(dtm_matrix),
  freq = colSums(dtm_matrix),
  max.words = 100,
  colors = brewer.pal(n = 8, name = "Dark2")
)

Conclusión

El clustering permitió agrupar las reseñas de 2001 según similitudes en su contenido, identificando patrones en las opiniones sin usar etiquetas ni una variable objetivo definida de antemano. Esto demuestra que el aprendizaje no supervisado es útil para analizar texto y detectar tendencias.