About data

Load the data

# Read the pipe-delimited file (no header row) and label its 24 columns in one step.
# NOTE(review): "FilmHoir" is presumably a misspelling of "FilmNoir"; it is kept
# as-is because the outputs shown in this document use that label.
movies <- setNames(
  read.table("movielens.txt", header = FALSE, sep = "|", quote = "\""),
  c(
    "ID", "Title", "ReleaseDate", "VideoReleaseDate", "IMDB",
    "Unknown", "Action", "Adventure", "Animation", "Childrens",
    "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
    "FilmHoir", "Horror", "Musical", "Mystery", "Romance",
    "SciFi", "Thriller", "War", "Western"
  )
)

Remove unnecessary columns

# Drop ID, ReleaseDate, VideoReleaseDate and IMDB; Title and the genre
# indicator columns remain.
movies[c("ID", "ReleaseDate", "VideoReleaseDate", "IMDB")] <- NULL
# Keep only the first occurrence of each fully identical row.
movies <- movies[!duplicated(movies), , drop = FALSE]
str(movies)
## 'data.frame':    1664 obs. of  20 variables:
##  $ Title      : Factor w/ 1664 levels "'Til There Was You (1997)",..: 1525 618 555 594 344 1318 1545 111 391 1240 ...
##  $ Unknown    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Action     : int  0 1 0 1 0 0 0 0 0 0 ...
##  $ Adventure  : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Animation  : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Childrens  : int  1 0 0 0 0 0 0 1 0 0 ...
##  $ Comedy     : int  1 0 0 1 0 0 0 1 0 0 ...
##  $ Crime      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Documentary: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Drama      : int  0 0 0 1 1 1 1 1 1 1 ...
##  $ Fantasy    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FilmHoir   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Horror     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Musical    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Mystery    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Romance    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SciFi      : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Thriller   : int  0 1 1 0 1 0 0 0 0 0 ...
##  $ War        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Western    : int  0 0 0 0 0 0 0 0 0 0 ...

Simple questions: how many movies are comedies, how many are westerns, and how many are both romance and drama?

# Number of movies flagged as Comedy.
print(sum(movies[["Comedy"]]))
## [1] 502
# Number of movies flagged as Western.
print(sum(movies[["Western"]]))
## [1] 27
# Number of movies flagged as both Romance and Drama.
print(with(movies, sum(Romance & Drama)))
## [1] 97

Hierarchical Clustering

Calculate pairwise distances between movies based on their genre features:

# Euclidean distance between every pair of movies, computed on the genre
# columns only (every column except Title, i.e. columns 2-20).
distances <- dist(movies[-1], method = "euclidean")
# Build the hierarchy with the "ward.D" agglomeration method.
# NOTE(review): per the hclust documentation, "ward.D" matches Ward's criterion
# only when given squared distances; "ward.D2" is the variant for unsquared
# ones. Left unchanged so the results shown in this document stay valid.
clusterMovies <- hclust(d = distances, method = "ward.D")
# Draw the dendrogram.
plot(clusterMovies)

Label each movie with its cluster (using 10 clusters):

# Cut the dendrogram into 10 groups; the result holds one cluster label per movie.
clusterGroups <- cutree(tree = clusterMovies, k = 10L)

Calculate average “action” genre value for each cluster

# Mean of the Action indicator within each cluster, i.e. the share of Action
# movies per cluster.
tapply(X = movies[["Action"]], INDEX = clusterGroups, FUN = mean)
##         1         2         3         4         5         6         7 
## 0.1784512 0.7839196 0.1238532 0.0000000 0.0000000 0.1015625 0.0000000 
##         8         9        10 
## 0.0000000 0.0000000 0.0000000

Find which cluster “Men in Black” is in:

# Show the row recorded for "Men in Black (1997)".
print(movies[which(movies$Title == "Men in Black (1997)"), ])
##                   Title Unknown Action Adventure Animation Childrens
## 257 Men in Black (1997)       0      1         1         0         0
##     Comedy Crime Documentary Drama Fantasy FilmHoir Horror Musical Mystery
## 257      1     0           0     0       0        0      0       0       0
##     Romance SciFi Thriller War Western
## 257       0     1        0   0       0
# Look the cluster label up by title instead of by the literal position 257.
# 257 is the row *name* printed above; it equals the row's position only as
# long as unique() removed no earlier rows. which() also skips any NA titles.
mibGrp <- clusterGroups[which(movies$Title == "Men in Black (1997)")]

Find other movies in same cluster as “Men in Black”:

# Rows of `movies` whose cluster label equals the one stored in mibGrp.
cluster2 <- movies[which(clusterGroups == mibGrp), ]
# First ten titles in that cluster.
cluster2[["Title"]][seq_len(10)]
##  [1] GoldenEye (1995)                              
##  [2] Bad Boys (1995)                               
##  [3] Apollo 13 (1995)                              
##  [4] Net, The (1995)                               
##  [5] Natural Born Killers (1994)                   
##  [6] Outbreak (1995)                               
##  [7] Stargate (1994)                               
##  [8] Fugitive, The (1993)                          
##  [9] Jurassic Park (1993)                          
## [10] Robert A. Heinlein's The Puppet Masters (1994)
## 1664 Levels: 'Til There Was You (1997) ... Zeus and Roxanne (1997)

Other ways to calculate average genre value:

Average of every genre within cluster 1 (repeat for each cluster group):

# Per-genre mean over the movies labelled as cluster 1 (genre columns 2-20 only).
colMeans(movies[clusterGroups == 1, 2:20])
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
## 0.006734007 0.178451178 0.185185185 0.134680135 0.393939394 0.363636364 
##       Crime Documentary       Drama     Fantasy    FilmHoir      Horror 
## 0.033670034 0.010101010 0.306397306 0.070707071 0.000000000 0.016835017 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
## 0.188552189 0.000000000 0.104377104 0.074074074 0.040404040 0.225589226 
##     Western 
## 0.090909091

Another way to calculate for cluster 1:

# One data frame of genre columns per cluster, named by cluster label.
spl <- split(movies[, 2:20], f = clusterGroups)
# Per-genre means for the cluster labelled "1".
colMeans(spl[["1"]])
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
## 0.006734007 0.178451178 0.185185185 0.134680135 0.393939394 0.363636364 
##       Crime Documentary       Drama     Fantasy    FilmHoir      Horror 
## 0.033670034 0.010101010 0.306397306 0.070707071 0.000000000 0.016835017 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
## 0.188552189 0.000000000 0.104377104 0.074074074 0.040404040 0.225589226 
##     Western 
## 0.090909091

An even better way:

# Per-genre means for every cluster at once: one column per cluster,
# one row per genre.
do.call(cbind, lapply(spl, colMeans))
##                       1           2           3 4 5         6 7         8
## Unknown     0.006734007 0.000000000 0.000000000 0 0 0.0000000 0 0.0000000
## Action      0.178451178 0.783919598 0.123853211 0 0 0.1015625 0 0.0000000
## Adventure   0.185185185 0.351758794 0.036697248 0 0 0.0000000 0 0.0000000
## Animation   0.134680135 0.010050251 0.000000000 0 0 0.0000000 0 0.0000000
## Childrens   0.393939394 0.005025126 0.009174312 0 0 0.0000000 0 0.0000000
## Comedy      0.363636364 0.065326633 0.064220183 0 1 0.1093750 1 0.0212766
## Crime       0.033670034 0.005025126 0.412844037 0 0 0.0468750 0 0.0000000
## Documentary 0.010101010 0.000000000 0.000000000 0 0 0.0000000 0 1.0000000
## Drama       0.306397306 0.110552764 0.380733945 1 0 0.6640625 0 0.0000000
## Fantasy     0.070707071 0.000000000 0.004587156 0 0 0.0000000 0 0.0000000
## FilmHoir    0.000000000 0.000000000 0.105504587 0 0 0.0078125 0 0.0000000
## Horror      0.016835017 0.080402010 0.018348624 0 0 0.0156250 0 0.0000000
## Musical     0.188552189 0.000000000 0.000000000 0 0 0.0000000 0 0.0000000
## Mystery     0.000000000 0.000000000 0.275229358 0 0 0.0000000 0 0.0000000
## Romance     0.104377104 0.045226131 0.036697248 0 0 1.0000000 1 0.0000000
## SciFi       0.074074074 0.346733668 0.041284404 0 0 0.0000000 0 0.0000000
## Thriller    0.040404040 0.376884422 0.610091743 0 0 0.1406250 0 0.0000000
## War         0.225589226 0.015075377 0.000000000 0 0 0.0000000 0 0.0212766
## Western     0.090909091 0.000000000 0.000000000 0 0 0.0000000 0 0.0000000
##             9        10
## Unknown     0 0.0000000
## Action      0 0.0000000
## Adventure   0 0.0000000
## Animation   0 0.0000000
## Childrens   0 0.0000000
## Comedy      1 0.1587302
## Crime       0 0.0000000
## Documentary 0 0.0000000
## Drama       1 0.0000000
## Fantasy     0 0.0000000
## FilmHoir    0 0.0000000
## Horror      0 1.0000000
## Musical     0 0.0000000
## Mystery     0 0.0000000
## Romance     0 0.0000000
## SciFi       0 0.0000000
## Thriller    0 0.1587302
## War         0 0.0000000
## Western     0 0.0000000

What happens if 2 clusters are used instead?

# Re-cut the same dendrogram into 2 groups (this overwrites the 10-cluster labels).
clusterGroups <- cutree(tree = clusterMovies, k = 2L)
# Split the genre columns by the new labels.
spl <- split(movies[, 2:20], f = clusterGroups)
# Per-genre means: one column per cluster, one row per genre.
do.call(cbind, lapply(spl, colMeans))
##                       1 2
## Unknown     0.001545595 0
## Action      0.192426584 0
## Adventure   0.102782071 0
## Animation   0.032457496 0
## Childrens   0.092735703 0
## Comedy      0.387944359 0
## Crime       0.082689335 0
## Documentary 0.038639876 0
## Drama       0.267387944 1
## Fantasy     0.017001546 0
## FilmHoir    0.018547141 0
## Horror      0.069551777 0
## Musical     0.043276662 0
## Mystery     0.046367852 0
## Romance     0.188562597 0
## SciFi       0.077279753 0
## Thriller    0.191653787 0
## War         0.054868624 0
## Western     0.020865533 0