library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tm)
## Cargando paquete requerido: NLP
##
## Adjuntando el paquete: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(wordcloud)
## Cargando paquete requerido: RColorBrewer
library(RColorBrewer)
# Load the raw movie-review table from a local CSV file.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this script on another computer.
movies <- read_csv(
  file = "C:/Users/ramir/Downloads/movie_reviews.csv"
)
## New names:
## Rows: 18862 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): movie_name, reviewer_name, review_text, rated, genre, directors, w... dbl
## (4): ...1, review_id, year, year_api
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Keep only the reviews whose API year is 1992 and whose text is present,
# then report how many reviews remain.
movies_1992 <- filter(
  movies,
  year_api == 1992,
  !is.na(review_text)
)
nrow(movies_1992)
## [1] 343
# Build a corpus with one document per 1992 review.
corpus <- VCorpus(VectorSource(movies_1992$review_text))

# Normalise the text: lower-case it, then strip punctuation, digits,
# English stop words and redundant whitespace, in that order.
# NOTE(review): punctuation is removed before stop words, so contractions
# such as "don't" become "dont" and presumably no longer match the stop-word
# list -- confirm, and consider swapping those two steps.
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stripWhitespace)

# Document-term matrix of term counts (one row per review, one column per
# term), converted to a dense base matrix.
dtm <- DocumentTermMatrix(corpus)
dtm_matrix <- as.matrix(dtm)

# Drop infrequent terms: keep only columns whose total count exceeds 5.
# drop = FALSE keeps the result a matrix even if a single term survives;
# without it the subset would collapse to a plain vector.
dtm_matrix <- dtm_matrix[, colSums(dtm_matrix) > 5, drop = FALSE]
dim(dtm_matrix)
## [1] 343 3659
# Fix the RNG seed so the random k-means initialisations are repeatable.
set.seed(123)

# Number of clusters; change this value to try a different k.
k <- 3

# Fit k-means on the term-count matrix, trying 25 random starts.
kmeans_model <- kmeans(x = dtm_matrix, centers = k, nstart = 25)

# Store each review's cluster label as a factor and show the cluster sizes.
movies_1992[["cluster"]] <- as.factor(kmeans_model[["cluster"]])
table(movies_1992[["cluster"]])
##
## 1 2 3
## 249 11 83
# Rebuild the document-term matrix from the cleaned corpus.
dtm <- DocumentTermMatrix(corpus)
dtm_matrix <- as.matrix(dtm)

# Drop infrequent terms: keep only columns whose total count exceeds 5.
# drop = FALSE keeps the result a matrix even if a single term survives.
dtm_matrix <- dtm_matrix[, colSums(dtm_matrix) > 5, drop = FALSE]

# Drop zero-variance columns, i.e. terms with the same count in every review.
# This runs after the k-means fit earlier in the script, so it only affects
# the summaries computed from dtm_matrix from this point on.
dtm_matrix <- dtm_matrix[, apply(dtm_matrix, 2, var) != 0, drop = FALSE]
dim(dtm_matrix)
## [1] 343 3648
# Vocabulary retained in the filtered document-term matrix.
terms <- colnames(dtm_matrix)

# For each cluster, print the ten terms with the highest total count across
# the reviews assigned to that cluster.
# seq_len(k) iterates zero times when k is 0, whereas 1:k would yield c(1, 0).
for (i in seq_len(k)) {
  cat("Cluster", i, "\n")
  # drop = FALSE keeps a one-review cluster as a 1-row matrix so that
  # colSums() still works.
  cluster_docs <- dtm_matrix[movies_1992$cluster == i, , drop = FALSE]
  # head() returns at most ten terms and never pads the result with NA when
  # fewer than ten terms are available.
  top_words <- head(sort(colSums(cluster_docs), decreasing = TRUE), 10)
  print(top_words)
  cat("\n\n")
}
## Cluster 1
## review movie film
## 1034 960 786
## reviews one author
## 782 574 505
## responsibility recartsmoviesreviews like
## 502 498 339
## good
## 295
##
##
## Cluster 2
## film one films horror phantom even like review just really
## 269 95 80 73 73 68 62 61 60 53
##
##
## Cluster 3
## film review reviews
## 948 353 262
## one movie like
## 257 213 173
## films author recartsmoviesreviews
## 171 169 167
## responsibility
## 166
# Word cloud of the overall term totals (at most 100 terms drawn),
# coloured with the 8-colour "Dark2" Brewer palette.
wordcloud(
  words = colnames(dtm_matrix),
  freq = colSums(dtm_matrix),
  max.words = 100,
  colors = brewer.pal(n = 8, name = "Dark2")
)
El clustering permitió agrupar las reseñas de 1992 según las similitudes en su contenido e identificar patrones en las opiniones sin usar una variable objetivo (etiqueta) definida previamente. Esto muestra que el aprendizaje no supervisado es útil para analizar texto y detectar tendencias.