Load the data
# Read the pipe-delimited item file. It has no header row, so the column
# names are assigned explicitly right after loading.
movies <- read.table("movielens.txt", header = FALSE, sep = "|", quote = "\"")
# NOTE(review): "FilmHoir" looks like a typo for "FilmNoir"; it is kept
# unchanged here because the name appears in every printed result below.
names(movies) <- c(
  "ID", "Title", "ReleaseDate", "VideoReleaseDate", "IMDB",
  "Unknown", "Action", "Adventure", "Animation", "Childrens",
  "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
  "FilmHoir", "Horror", "Musical", "Mystery", "Romance",
  "SciFi", "Thriller", "War", "Western"
)
Remove unnecessary columns
# Drop the ID, date and IMDB columns in one step; only the title and the
# genre indicator columns are kept.
movies <- movies[!(names(movies) %in%
  c("ID", "ReleaseDate", "VideoReleaseDate", "IMDB"))]
# Keep only the first occurrence of each fully identical row.
movies <- movies[!duplicated(movies), , drop = FALSE]
# Show the structure of the cleaned data frame.
str(movies)
## 'data.frame': 1664 obs. of 20 variables:
## $ Title : Factor w/ 1664 levels "'Til There Was You (1997)",..: 1525 618 555 594 344 1318 1545 111 391 1240 ...
## $ Unknown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Action : int 0 1 0 1 0 0 0 0 0 0 ...
## $ Adventure : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Animation : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Childrens : int 1 0 0 0 0 0 0 1 0 0 ...
## $ Comedy : int 1 0 0 1 0 0 0 1 0 0 ...
## $ Crime : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Documentary: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Drama : int 0 0 0 1 1 1 1 1 1 1 ...
## $ Fantasy : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FilmHoir : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Horror : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Musical : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Mystery : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Romance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SciFi : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Thriller : int 0 1 1 0 1 0 0 0 0 0 ...
## $ War : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Western : int 0 0 0 0 0 0 0 0 0 0 ...
Simple questions: how many movies are comedies, how many are westerns, and how many are both romance and drama?
# Number of movies flagged as comedy (sum of the indicator column).
print(sum(movies[["Comedy"]]))
## [1] 502
# Number of movies flagged as western (sum of the indicator column).
print(sum(movies[["Western"]]))
## [1] 27
# Number of movies flagged as both romance and drama: the elementwise AND
# is TRUE only where both indicators are non-zero.
print(with(movies, sum(Romance & Drama)))
## [1] 97
Calculate distances between genre features:
# Pairwise Euclidean distances between movies, computed on columns 2 to 20
# (the genre indicators; column 1 is the title).
distances <- dist(x = movies[2:20], method = "euclidean")
# Agglomerative hierarchical clustering on those distances.
# NOTE(review): per the hclust() documentation, "ward.D" applied to unsquared
# distances does not implement Ward's criterion ("ward.D2" does) -- confirm
# this is the intended linkage; switching would change the results below.
clusterMovies <- hclust(d = distances, method = "ward.D")
# Draw the dendrogram.
plot(clusterMovies)
Assign each movie to a cluster (using 10 clusters):
# Cut the dendrogram into 10 groups; the result holds one cluster label per
# clustered row of `movies`.
clusterGroups <- cutree(tree = clusterMovies, k = 10)
Calculate average “action” genre value for each cluster
# Mean of the Action indicator within each cluster. The column appears to be
# coded 0/1, in which case this is the share of action movies per cluster.
tapply(X = movies[["Action"]], INDEX = clusterGroups, FUN = mean)
## 1 2 3 4 5 6 7
## 0.1784512 0.7839196 0.1238532 0.0000000 0.0000000 0.1015625 0.0000000
## 8 9 10
## 0.0000000 0.0000000 0.0000000
Find which cluster “Men in Black” is in:
# Show the row for "Men in Black"; which() discards NA comparisons, matching
# how subset() treats them.
print(movies[which(movies$Title == "Men in Black (1997)"), , drop = FALSE])
## Title Unknown Action Adventure Animation Childrens
## 257 Men in Black (1997) 0 1 1 0 0
## Comedy Crime Documentary Drama Fantasy FilmHoir Horror Musical Mystery
## 257 1 0 0 0 0 0 0 0 0
## Romance SciFi Thriller War Western
## 257 0 1 0 0 0
# Look up the cluster of "Men in Black" by title rather than by a literal
# position. The 257 shown when the row is printed is a row name, and once
# unique() has dropped rows a row name is not guaranteed to equal the row's
# position -- which is what clusterGroups[257] indexes by.
# match() returns the first matching position, or NA when the title is absent.
mibGrp <- clusterGroups[match("Men in Black (1997)", movies$Title)]
# Fail loudly instead of carrying an NA cluster label into later steps.
stopifnot(!is.na(mibGrp))
Find other movies in same cluster as “Men in Black”:
# Collect every movie that shares the "Men in Black" cluster label.
cluster2 <- movies[which(clusterGroups == mibGrp), , drop = FALSE]
# Peek at the first ten titles in that cluster.
cluster2[["Title"]][seq_len(10)]
## [1] GoldenEye (1995)
## [2] Bad Boys (1995)
## [3] Apollo 13 (1995)
## [4] Net, The (1995)
## [5] Natural Born Killers (1994)
## [6] Outbreak (1995)
## [7] Stargate (1994)
## [8] Fugitive, The (1993)
## [9] Jurassic Park (1993)
## [10] Robert A. Heinlein's The Puppet Masters (1994)
## 1664 Levels: 'Til There Was You (1997) ... Zeus and Roxanne (1997)
Other ways to calculate average genre value:
Repeat for each cluster group:
# Mean of every genre column over the movies assigned to cluster 1.
colMeans(movies[which(clusterGroups == 1), 2:20])
## Unknown Action Adventure Animation Childrens Comedy
## 0.006734007 0.178451178 0.185185185 0.134680135 0.393939394 0.363636364
## Crime Documentary Drama Fantasy FilmHoir Horror
## 0.033670034 0.010101010 0.306397306 0.070707071 0.000000000 0.016835017
## Musical Mystery Romance SciFi Thriller War
## 0.188552189 0.000000000 0.104377104 0.074074074 0.040404040 0.225589226
## Western
## 0.090909091
Another way to calculate for cluster 1:
# Partition the genre columns into one data frame per cluster label.
spl <- split(x = movies[2:20], f = clusterGroups)
# Genre means for the first group in the split.
colMeans(spl[[1L]])
## Unknown Action Adventure Animation Childrens Comedy
## 0.006734007 0.178451178 0.185185185 0.134680135 0.393939394 0.363636364
## Crime Documentary Drama Fantasy FilmHoir Horror
## 0.033670034 0.010101010 0.306397306 0.070707071 0.000000000 0.016835017
## Musical Mystery Romance SciFi Thriller War
## 0.188552189 0.000000000 0.104377104 0.074074074 0.040404040 0.225589226
## Western
## 0.090909091
An even better way:
# Apply colMeans() to every per-cluster data frame at once; the result has
# one column per cluster and one row per genre.
sapply(spl, function(cluster_df) colMeans(cluster_df))
## 1 2 3 4 5 6 7 8
## Unknown 0.006734007 0.000000000 0.000000000 0 0 0.0000000 0 0.0000000
## Action 0.178451178 0.783919598 0.123853211 0 0 0.1015625 0 0.0000000
## Adventure 0.185185185 0.351758794 0.036697248 0 0 0.0000000 0 0.0000000
## Animation 0.134680135 0.010050251 0.000000000 0 0 0.0000000 0 0.0000000
## Childrens 0.393939394 0.005025126 0.009174312 0 0 0.0000000 0 0.0000000
## Comedy 0.363636364 0.065326633 0.064220183 0 1 0.1093750 1 0.0212766
## Crime 0.033670034 0.005025126 0.412844037 0 0 0.0468750 0 0.0000000
## Documentary 0.010101010 0.000000000 0.000000000 0 0 0.0000000 0 1.0000000
## Drama 0.306397306 0.110552764 0.380733945 1 0 0.6640625 0 0.0000000
## Fantasy 0.070707071 0.000000000 0.004587156 0 0 0.0000000 0 0.0000000
## FilmHoir 0.000000000 0.000000000 0.105504587 0 0 0.0078125 0 0.0000000
## Horror 0.016835017 0.080402010 0.018348624 0 0 0.0156250 0 0.0000000
## Musical 0.188552189 0.000000000 0.000000000 0 0 0.0000000 0 0.0000000
## Mystery 0.000000000 0.000000000 0.275229358 0 0 0.0000000 0 0.0000000
## Romance 0.104377104 0.045226131 0.036697248 0 0 1.0000000 1 0.0000000
## SciFi 0.074074074 0.346733668 0.041284404 0 0 0.0000000 0 0.0000000
## Thriller 0.040404040 0.376884422 0.610091743 0 0 0.1406250 0 0.0000000
## War 0.225589226 0.015075377 0.000000000 0 0 0.0000000 0 0.0212766
## Western 0.090909091 0.000000000 0.000000000 0 0 0.0000000 0 0.0000000
## 9 10
## Unknown 0 0.0000000
## Action 0 0.0000000
## Adventure 0 0.0000000
## Animation 0 0.0000000
## Childrens 0 0.0000000
## Comedy 1 0.1587302
## Crime 0 0.0000000
## Documentary 0 0.0000000
## Drama 1 0.0000000
## Fantasy 0 0.0000000
## FilmHoir 0 0.0000000
## Horror 0 1.0000000
## Musical 0 0.0000000
## Mystery 0 0.0000000
## Romance 0 0.0000000
## SciFi 0 0.0000000
## Thriller 0 0.1587302
## War 0 0.0000000
## Western 0 0.0000000
What happens if 2 clusters are used instead?
# Re-cut the same dendrogram into just two groups.
clusterGroups <- cutree(tree = clusterMovies, k = 2)
# Rebuild the per-cluster split from the new labels.
spl <- split(x = movies[2:20], f = clusterGroups)
# Genre means per cluster, one column for each of the two groups.
sapply(X = spl, FUN = colMeans)
## 1 2
## Unknown 0.001545595 0
## Action 0.192426584 0
## Adventure 0.102782071 0
## Animation 0.032457496 0
## Childrens 0.092735703 0
## Comedy 0.387944359 0
## Crime 0.082689335 0
## Documentary 0.038639876 0
## Drama 0.267387944 1
## Fantasy 0.017001546 0
## FilmHoir 0.018547141 0
## Horror 0.069551777 0
## Musical 0.043276662 0
## Mystery 0.046367852 0
## Romance 0.188562597 0
## SciFi 0.077279753 0
## Thriller 0.191653787 0
## War 0.054868624 0
## Western 0.020865533 0