Σε αυτή την εργασία εξετάζουμε το dataset MovieLens 100K, συγκεκριμένα το αρχείο u.item, το οποίο περιέχει πληροφορίες για ταινίες (τίτλος, ημερομηνία και genres).
Το dataset χρησιμοποιείται ευρέως στην αξιολόγηση συστημάτων συστάσεων και μεθόδων συσταδοποίησης (clustering).
Στόχος είναι:

- Να εφαρμόσουμε ιεραρχική συσταδοποίηση
- (Προαιρετικά) να εφαρμόσουμε K-means clustering
- Να αναλύσουμε τα αποτελέσματα
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Download and parse the MovieLens 100K item file: one movie per row,
# pipe-separated, no header line.
url <- "https://files.grouplens.org/datasets/movielens/ml-100k/u.item"
movies <- read.table(
  url,
  sep = "|",
  header = FALSE,
  quote = "",
  fill = TRUE,
  # u.item is not UTF-8: accented titles are stored as ISO-8859-1 bytes, so
  # declare the encoding to get valid strings instead of garbled titles.
  fileEncoding = "latin1",
  # Titles are free text; never treat '#' as the start of a comment.
  comment.char = "",
  # Supplying the names up front fixes the column count at 24 (with
  # fill = TRUE it is otherwise guessed from the first five lines only).
  col.names = c(
    "movie_id", "title", "release_date", "video_release_date", "IMDb_URL",
    "unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "FilmNoir", "Horror",
    "Musical", "Mystery", "Romance", "SciFi", "Thriller", "War", "Western"
  )
)
# Preview the metadata columns and the first few genre indicators.
head(movies[, 1:10])
## movie_id title release_date
## 1 1 Toy Story (1995) 01-Jan-1995
## 2 2 GoldenEye (1995) 01-Jan-1995
## 3 3 Four Rooms (1995) 01-Jan-1995
## 4 4 Get Shorty (1995) 01-Jan-1995
## 5 5 Copycat (1995) 01-Jan-1995
## 6 6 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) 01-Jan-1995
## video_release_date
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## IMDb_URL unknown Action
## 1 http://us.imdb.com/M/title-exact?Toy%20Story%20(1995) 0 0
## 2 http://us.imdb.com/M/title-exact?GoldenEye%20(1995) 0 1
## 3 http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995) 0 0
## 4 http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995) 0 1
## 5 http://us.imdb.com/M/title-exact?Copycat%20(1995) 0 0
## 6 http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995) 0 0
## Adventure Animation Children
## 1 0 1 1
## 2 1 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
# Keep only the genre indicator columns from Action through Western
# (the "unknown" column sits before Action and is therefore left out).
genres <- dplyr::select(movies, Action:Western)

# Standardise every genre column (centre to mean 0, scale to sd 1).
genres_scaled <- scale(genres, center = TRUE, scale = TRUE)

# Pairwise Euclidean distances between movies, then Ward linkage.
dist_matrix <- dist(genres_scaled, method = "euclidean")
hc <- hclust(dist_matrix, method = "ward.D2")

# Dendrogram without leaf labels (one leaf per movie would be unreadable).
plot(hc, labels = FALSE)

# Cut the tree into 5 clusters and report the cluster sizes.
clusters_hc <- cutree(hc, k = 5)
table(clusters_hc)
## clusters_hc
## 1 2 3 4 5
## 62 1332 160 50 78
# PCA on the already standardised genre matrix; used only to get a 2-D view.
pca <- prcomp(genres_scaled, center = TRUE, scale. = FALSE)

# First two principal components, coloured by hierarchical cluster.
plot(pca$x[, c("PC1", "PC2")], col = clusters_hc, pch = 19)
# Fix the RNG so the random K-means starts are reproducible.
set.seed(123)

# K-means with 5 centres; the best of 25 random starts is kept.
kmeans_res <- kmeans(
  x = genres_scaled,
  centers = 5,
  iter.max = 10,
  nstart = 25
)

# Cluster sizes.
table(kmeans_res[["cluster"]])
##
## 1 2 3 4 5
## 60 42 194 751 635
# Same PCA projection as before, now coloured by K-means cluster.
plot(pca$x[, c("PC1", "PC2")], col = kmeans_res$cluster, pch = 19)
## Συμπεράσματα