library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tm)
## Cargando paquete requerido: NLP
##
## Adjuntando el paquete: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(wordcloud)
## Cargando paquete requerido: RColorBrewer
library(RColorBrewer)
# Load the movie-review dataset from CSV into a tibble.
# The earlier interactive file.choose() call was dropped: its result was never
# used, and it opens a file dialog that stalls or fails in non-interactive runs
# (e.g. Rscript or knitting).
# NOTE(review): this absolute path is machine-specific; consider a
# project-relative path so the script runs on other machines.
movies <- read_csv("C:\\Users\\anton\\Downloads\\movie_reviews (1).csv")
## New names:
## Rows: 18862 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): movie_name, reviewer_name, review_text, rated, genre, directors, w... dbl
## (4): ...1, review_id, year, year_api
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Keep only the rows whose `year_api` equals 1990 and whose review text is
# present, then report how many reviews remain.
# NOTE(review): the object is called `movies_1992` although the filter selects
# 1990 -- confirm which year is intended. The name is kept because later code
# refers to it.
movies_1992 <- filter(movies, year_api == 1990, !is.na(review_text))
nrow(movies_1992)
## [1] 193
# Build a corpus with one document per review text.
corpus <- VCorpus(VectorSource(movies_1992$review_text))

# Normalise the text step by step: lower-case, strip punctuation and digits,
# remove English stop words, and collapse repeated whitespace.
# NOTE(review): punctuation is stripped before stop-word removal, so
# contractions written with an apostrophe may no longer match the stop-word
# list -- confirm this order is intended.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
# Build the document-term matrix from the cleaned corpus and convert it to a
# dense numeric matrix (one row per document, one column per term).
dtm <- DocumentTermMatrix(corpus)
dtm_matrix <- as.matrix(dtm)

# Drop infrequent terms: keep only the columns whose total count over all
# documents is greater than 5. `drop = FALSE` keeps the result a matrix even
# if a single term survives, so dim() and the clustering step still work.
dtm_matrix <- dtm_matrix[, colSums(dtm_matrix) > 5, drop = FALSE]
dim(dtm_matrix)
## [1] 193 2237
# Fix the RNG seed so the random k-means starts are reproducible.
set.seed(123)
k <- 4 # number of clusters; can be changed

# Cluster the documents on their term counts, trying 25 random starts.
kmeans_model <- kmeans(x = dtm_matrix, centers = k, nstart = 25)

# Store each review's cluster label as a factor column and show cluster sizes.
movies_1992[["cluster"]] <- as.factor(kmeans_model[["cluster"]])
table(movies_1992[["cluster"]])
##
## 1 2 3 4
## 7 15 1 170
# Prepare the term matrix used for the per-cluster summaries and the word
# cloud. `dtm_matrix` already holds the counts built from `corpus` with the
# infrequent terms (total count <= 5) removed, so the identical
# DocumentTermMatrix() / as.matrix() / colSums() steps are not repeated here.

# Remove zero-variance columns, i.e. terms whose count is the same in every
# document. `drop = FALSE` keeps the result a matrix even if only one column
# remains.
dtm_matrix <- dtm_matrix[, apply(dtm_matrix, 2, var) != 0, drop = FALSE]
dim(dtm_matrix)
## [1] 193 2221
# Print the most frequent terms (up to ten) within each k-means cluster.
terms <- colnames(dtm_matrix)
# seq_len(k) yields an empty sequence when k is 0, whereas 1:k would iterate
# over c(1, 0).
for (i in seq_len(k)) {
  cat("Cluster", i, "\n")
  # Rows of the term matrix for the documents assigned to cluster i;
  # drop = FALSE keeps a matrix even for a single-document cluster.
  cluster_docs <- dtm_matrix[movies_1992$cluster == i, , drop = FALSE]
  if (nrow(cluster_docs) > 0) {
    # head() returns at most 10 entries and never pads with NA, unlike
    # [1:10] when fewer than 10 terms are available.
    top_words <- head(sort(colSums(cluster_docs), decreasing = TRUE), 10)
    print(top_words)
  } else {
    cat("No hay documentos en este cluster\n")
  }
  cat("\n\n")
}
## Cluster 1
## film phantom films horror one like just dead review even
## 128 85 71 68 50 49 40 38 38 35
##
##
## Cluster 2
## film one like review films story life reviews can henry
## 232 102 68 64 62 58 56 55 52 48
##
##
## Cluster 3
## misery novel horror king film like paul one kings much
## 27 24 20 17 16 16 14 13 12 11
##
##
## Cluster 4
## review film movie reviews one
## 710 695 615 526 377
## author responsibility like original story
## 346 342 208 207 206
# Draw a word cloud of the terms in `dtm_matrix`, sized by their total count
# over all documents, limited to 100 words and coloured with the 8-colour
# "Dark2" palette.
wordcloud(
  words = colnames(dtm_matrix),
  freq = colSums(dtm_matrix),
  max.words = 100,
  colors = brewer.pal(n = 8, name = "Dark2")
)
# Conclusiones

Después de observar los resultados que nos arrojó el código, podemos ver que
las palabras más recurrentes son review, film, movie y reviews. Con esto
podemos ver la homogeneidad que existe en el tema dentro de la base de datos.