library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Load the movie-review dataset from a local CSV file.
# NOTE(review): the path is absolute and user-specific; consider a
# project-relative path so the script runs on other machines.
movies <- read_csv(
  file = "/Users/edu_sssedu/Desktop/Concentración/movie_reviews.csv"
)
## New names:
## Rows: 18862 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): movie_name, reviewer_name, review_text, rated, genre, directors, w... dbl
## (4): ...1, review_id, year, year_api
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Keep only rows whose `year_api` is 2001 and whose review text is present.
movies_2001 <- filter(movies, year_api == 2001, !is.na(review_text))
nrow(movies_2001)
## [1] 854
# Build a corpus with one document per review, then normalise the text:
# lower-case, remove English stop words, punctuation and digits, and collapse
# repeated whitespace.
corpus <- VCorpus(VectorSource(movies_2001$review_text))
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  # Stop words must be removed while apostrophes are still present: the
  # English list contains contractions such as "don't" and "isn't", which no
  # longer match once removePunctuation() has turned them into "dont"/"isnt".
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
# Document-term matrix: one row per review, one column per term.
dtm <- DocumentTermMatrix(corpus)
dtm_matrix <- as.matrix(dtm)
# Drop infrequent terms: keep columns whose total count exceeds 5.
term_totals <- colSums(dtm_matrix)
dtm_matrix <- dtm_matrix[, term_totals > 5]
dim(dtm_matrix)
## [1] 854 8200
# Fix the RNG seed before k-means, which is run with 25 random starts.
set.seed(123)
k <- 3 # number of clusters; change as needed
kmeans_model <- kmeans(x = dtm_matrix, centers = k, nstart = 25)
# Store each review's cluster assignment as a factor and show cluster sizes.
movies_2001$cluster <- factor(kmeans_model$cluster)
table(movies_2001$cluster)
##
## 1 2 3
## 22 135 697
# Rebuild the document-term matrix from the cleaned corpus.
dtm <- DocumentTermMatrix(corpus)
dtm_matrix <- as.matrix(dtm)
# Drop infrequent terms: keep columns whose total count exceeds 5.
frequent_terms <- colSums(dtm_matrix) > 5
dtm_matrix <- dtm_matrix[, frequent_terms]
# Drop columns with zero variance (the same count in every review).
term_variance <- apply(dtm_matrix, MARGIN = 2, FUN = var)
dtm_matrix <- dtm_matrix[, term_variance != 0]
dim(dtm_matrix)
## [1] 854 8190
# Print the ten most frequent terms within each k-means cluster.
# Vocabulary retained after filtering (not used by the loop below).
terms <- colnames(dtm_matrix)
# seq_len() yields an empty sequence when k is 0, unlike 1:k.
for (i in seq_len(k)) {
  cat("Cluster", i, "\n")
  # drop = FALSE keeps a one-review cluster as a 1-row matrix; without it the
  # subset collapses to a plain vector and colSums() fails.
  cluster_docs <- dtm_matrix[movies_2001$cluster == i, , drop = FALSE]
  # head() returns at most 10 terms rather than padding with NA when fewer
  # than 10 columns exist.
  top_words <- head(sort(colSums(cluster_docs), decreasing = TRUE), 10)
  print(top_words)
  cat("\n\n")
}
## Cluster 1
## david spielberg film love kubrick review real story
## 191 151 136 117 98 91 80 80
## reviews can
## 79 77
##
##
## Cluster 2
## film movie review
## 847 617 581
## one reviews like
## 547 490 354
## just author responsibility
## 314 275 273
## recartsmoviesreviews
## 271
##
##
## Cluster 3
## review reviews movie
## 2946 2457 2104
## film author responsibility
## 1419 1418 1402
## recartsmoviesreviews one like
## 1395 1354 1078
## original
## 911
# Word cloud of term frequencies over all rows of dtm_matrix, capped at 100
# words and coloured with the 8-colour "Dark2" palette.
wordcloud(
  words = colnames(dtm_matrix),
  freq = colSums(dtm_matrix),
  max.words = 100,
  colors = brewer.pal(n = 8, name = "Dark2")
)
El clustering (k-means) permitió agrupar las reseñas de 2001 según similitudes en su contenido e identificar patrones en las opiniones sin usar una variable objetivo definida previamente. Esto muestra que el aprendizaje no supervisado es útil para analizar texto y detectar tendencias.