library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tm)
## Cargando paquete requerido: NLP
## 
## Adjuntando el paquete: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(wordcloud)
## Cargando paquete requerido: RColorBrewer
library(RColorBrewer)
# Interactive helper for locating a file on disk. Its return value is not
# used anywhere in this script, so the call is skipped when the script runs
# non-interactively (e.g. while the document is being rendered) instead of
# prompting for a file.
if (interactive()) {
  file.choose()
}

Cargar base de datos

# Read the movie-review data set from a CSV file.
# NOTE(review): the path is absolute and specific to one machine — confirm or
# adjust it before running this script elsewhere.
reviews_path <- "C:\\Users\\anton\\Downloads\\movie_reviews (1).csv"
movies <- read_csv(file = reviews_path)
## New names:
## Rows: 18862 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (16): movie_name, reviewer_name, review_text, rated, genre, directors, w... dbl
## (4): ...1, review_id, year, year_api
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Keep the reviews whose `year_api` is 1990 and whose review text is present.
# NOTE(review): the object is called `movies_1992` although the filter selects
# 1990 — confirm which year is intended. The name is kept as-is because the
# later steps of the script refer to it.
movies_1992 <- filter(movies, year_api == 1990, !is.na(review_text))

# Number of reviews kept after filtering.
nrow(movies_1992)
## [1] 193

Limpiar texto

# Build a corpus with one document per review text.
corpus <- VCorpus(VectorSource(movies_1992$review_text))

# Normalise the text: lower-case, drop English stop words, strip punctuation
# and digits, then collapse repeated whitespace.
# Stop words are removed BEFORE punctuation is stripped: tm's English
# stop-word list contains contractions such as "don't" and "isn't", which
# would no longer match once their apostrophes had been deleted.
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)

Term Matrix

# Build the document-term matrix (rows = documents, columns = terms) and
# convert it to a dense base matrix.
dtm <- DocumentTermMatrix(corpus)

dtm_matrix <- as.matrix(dtm)

# Drop infrequent terms: keep the columns whose total count across all
# documents is greater than 5. `drop = FALSE` keeps the result a matrix even
# if only a single term survives the filter.
frequent_terms <- colSums(dtm_matrix) > 5
dtm_matrix <- dtm_matrix[, frequent_terms, drop = FALSE]

# Dimensions after filtering: documents x terms.
dim(dtm_matrix)
## [1]  193 2237

K-MEANS

# Fix the RNG seed so the random k-means starts are reproducible.
set.seed(123)

# Number of clusters; change this value to try a different partition.
k <- 4

# k-means on the term-count matrix, with 25 random starts.
# NOTE(review): the counts are clustered as-is; no weighting or normalisation
# is applied in this script — confirm that this is intended.
kmeans_model <- kmeans(x = dtm_matrix, centers = k, nstart = 25)

# Store each review's cluster label as a factor column.
movies_1992[["cluster"]] <- as.factor(kmeans_model$cluster)

# Number of reviews per cluster.
table(movies_1992[["cluster"]])
## 
##   1   2   3   4 
##   7  15   1 170

Visualización de clusters

# `dtm_matrix` already holds the frequency-filtered document-term matrix built
# in the "Term Matrix" section, so it is reused here instead of rebuilding the
# DTM from the corpus and repeating the same frequency filter.

# Drop constant columns (terms whose count has zero variance across
# documents). `drop = FALSE` keeps the result a matrix even if only one
# column remains.
# NOTE(review): presumably required by a scaled cluster plot, but no plotting
# call is present in this section — confirm.
term_variance <- apply(dtm_matrix, 2, var)
dtm_matrix <- dtm_matrix[, term_variance != 0, drop = FALSE]

# Dimensions after removing constant columns: documents x terms.
dim(dtm_matrix)
## [1]  193 2221

Palabras del cluster

# Vocabulary of the filtered matrix (not referenced by the loop below).
terms <- colnames(dtm_matrix)

# Print the ten most frequent terms of every cluster.
# `seq_len(k)` is used instead of `1:k` so that the loop is skipped, rather
# than run for i = 1 and i = 0, if `k` is ever 0.
for (i in seq_len(k)) {
  cat("Cluster", i, "\n")

  # Rows (documents) assigned to cluster i; `drop = FALSE` keeps a matrix even
  # when the cluster contains a single document.
  cluster_docs <- dtm_matrix[movies_1992$cluster == i, , drop = FALSE]

  if (nrow(cluster_docs) > 0) {
    # Total count of each term within the cluster, most frequent first.
    # `head()` returns at most 10 entries, whereas `[1:10]` would pad the
    # result with NA when fewer than 10 terms are available.
    top_words <- head(sort(colSums(cluster_docs), decreasing = TRUE), 10)
    print(top_words)
  } else {
    cat("No hay documentos en este cluster\n")
  }

  cat("\n\n")
}
## Cluster 1 
##    film phantom   films  horror     one    like    just    dead  review    even 
##     128      85      71      68      50      49      40      38      38      35 
## 
## 
## Cluster 2 
##    film     one    like  review   films   story    life reviews     can   henry 
##     232     102      68      64      62      58      56      55      52      48 
## 
## 
## Cluster 3 
## misery  novel horror   king   film   like   paul    one  kings   much 
##     27     24     20     17     16     16     14     13     12     11 
## 
## 
## Cluster 4 
##         review           film          movie        reviews            one 
##            710            695            615            526            377 
##         author responsibility           like       original          story 
##            346            342            208            207            206

Nube de palabras

# Word cloud of the overall term frequencies (column totals of the matrix),
# limited to 100 words and coloured with the 8-colour "Dark2" palette.
term_freq <- colSums(dtm_matrix)
cloud_palette <- brewer.pal(n = 8, name = "Dark2")

wordcloud(
  words = colnames(dtm_matrix),
  freq = term_freq,
  max.words = 100,
  colors = cloud_palette
)

# Conclusiones

Después de observar los resultados que nos arrojó el código, podemos ver que las palabras más recurrentes son review, film, movie y reviews. Con esto podemos ver la homogeneidad que existe en el tema dentro de la base de datos.