# Clustering
# read data file

# Load the movie/genre table from movies.csv in the current working directory.
# stringsAsFactors is set explicitly because its default changed from TRUE to
# FALSE in R 4.0.0; the output recorded below (Title as a Factor with 1664
# levels) was produced with factors, so this keeps the script reproducible
# on both old and new R versions.
movies <- read.csv("movies.csv", stringsAsFactors = TRUE)

# Take a look at our data:
str(movies)
## 'data.frame':    1664 obs. of  21 variables:
##  $ X          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title      : Factor w/ 1664 levels "'Til There Was You (1997)",..: 1525 618 555 594 344 1318 1545 111 391 1240 ...
##  $ Unknown    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Action     : int  0 1 0 1 0 0 0 0 0 0 ...
##  $ Adventure  : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Animation  : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Childrens  : int  1 0 0 0 0 0 0 1 0 0 ...
##  $ Comedy     : int  1 0 0 1 0 0 0 1 0 0 ...
##  $ Crime      : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ Documentary: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Drama      : int  0 0 0 1 1 1 1 1 1 1 ...
##  $ Fantasy    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ FilmNoir   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Horror     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Musical    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Mystery    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Romance    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SciFi      : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Thriller   : int  0 1 1 0 1 0 0 0 0 0 ...
##  $ War        : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Western    : int  0 0 0 0 0 0 0 0 0 0 ...
# Column-by-column summary of the data frame.
summary(object = movies)
##        X                                               Title     
##  Min.   :   1.0   'Til There Was You (1997)               :   1  
##  1st Qu.: 419.8   <c1> k<f6>ldum klaka (Cold Fever) (1994):   1  
##  Median : 838.5   1-900 (1994)                            :   1  
##  Mean   : 838.7   101 Dalmatians (1996)                   :   1  
##  3rd Qu.:1258.2   12 Angry Men (1957)                     :   1  
##  Max.   :1682.0   187 (1997)                              :   1  
##                   (Other)                                 :1658  
##     Unknown             Action         Adventure         Animation      
##  Min.   :0.000000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.000000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.001202   Mean   :0.1496   Mean   :0.07993   Mean   :0.02524  
##  3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.000000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##                                                                         
##    Childrens           Comedy           Crime         Documentary     
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.07212   Mean   :0.3017   Mean   :0.0643   Mean   :0.03005  
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##                                                                       
##      Drama           Fantasy           FilmNoir           Horror       
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.4303   Mean   :0.01322   Mean   :0.01442   Mean   :0.05409  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##                                                                        
##     Musical           Mystery           Romance           SciFi       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.03365   Mean   :0.03606   Mean   :0.1466   Mean   :0.0601  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##                                                                       
##     Thriller          War             Western       
##  Min.   :0.000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.000   Median :0.00000   Median :0.00000  
##  Mean   :0.149   Mean   :0.04267   Mean   :0.01623  
##  3rd Qu.:0.000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.00000  
## 
# Show the leading rows (six, which is head()'s default).
head(x = movies, n = 6L)
##   X                                                Title Unknown Action
## 1 1                                     Toy Story (1995)       0      0
## 2 2                                     GoldenEye (1995)       0      1
## 3 3                                    Four Rooms (1995)       0      0
## 4 4                                    Get Shorty (1995)       0      1
## 5 5                                       Copycat (1995)       0      0
## 6 6 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)       0      0
##   Adventure Animation Childrens Comedy Crime Documentary Drama Fantasy
## 1         0         1         1      1     0           0     0       0
## 2         1         0         0      0     0           0     0       0
## 3         0         0         0      0     0           0     0       0
## 4         0         0         0      1     0           0     1       0
## 5         0         0         0      0     1           0     1       0
## 6         0         0         0      0     0           0     1       0
##   FilmNoir Horror Musical Mystery Romance SciFi Thriller War Western
## 1        0      0       0       0       0     0        0   0       0
## 2        0      0       0       0       0     0        1   0       0
## 3        0      0       0       0       0     0        1   0       0
## 4        0      0       0       0       0     0        0   0       0
## 5        0      0       0       0       0     0        1   0       0
## 6        0      0       0       0       0     0        0   0       0
# Number of movies without (0) and with (1) the Comedy flag.
table(movies[["Comedy"]])
## 
##    0    1 
## 1162  502
# The same split expressed as proportions of all movies.
prop.table(table(movies[["Comedy"]]))
## 
##         0         1 
## 0.6983173 0.3016827
# Cross-tabulate Romance (rows) against Drama (columns), without
# dimension labels.
table(movies[["Romance"]], movies[["Drama"]])
##    
##       0   1
##   0 801 619
##   1 147  97
# Same counts with labelled dimensions: inside with(), table() receives the
# bare column names and uses them as the dimension labels.
with(movies, table(Romance, Drama))
##        Drama
## Romance   0   1
##       0 801 619
##       1 147  97
# Cell proportions of the labelled table (each cell divided by the total).
prop.table(with(movies, table(Romance, Drama)))
##        Drama
## Romance          0          1
##       0 0.48137019 0.37199519
##       1 0.08834135 0.05829327
# Pairwise Euclidean distances between movies, computed on columns 3 to 21
# only (the genre indicators; X and Title are left out).
distances <- dist(x = movies[, 3:21], method = "euclidean")

# Agglomerative hierarchical clustering on those distances.
# NOTE(review): per the hclust documentation, "ward.D" does not implement
# Ward's criterion on unsquared distances ("ward.D2" does). Kept as-is
# because every result recorded below was produced with "ward.D".
clusterMovies <- hclust(d = distances, method = "ward.D")

# Draw the dendrogram
plot(x = clusterMovies)

# Quick question: cut the tree into 2 groups and compare the share of Drama
# movies in each group.

clusterGroups2 <- cutree(tree = clusterMovies, k = 2)
tapply(X = movies$Drama, INDEX = clusterGroups2, FUN = mean)
##         1         2 
## 0.2673879 1.0000000
# Use the merge heights (the cluster distance at each merge) to help decide
# on the number of clusters.
summary(object = clusterMovies$height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.541   0.000 383.700
quantile(x = clusterMovies$height, probs = c(0.25, 0.50, 0.75, 0.90, 0.99))
##       25%       50%       75%       90%       99% 
##  0.000000  0.000000  0.000000  1.400406 29.675344
# Show the six largest merge heights (listed from smallest to largest)
tail(x = sort(clusterMovies$height), n = 6L)
## [1]  79.87443  84.12643  99.16058 149.08957 235.16817 383.73607
# Plot the 20 largest merge heights.
atemp <- tail(x = sort(clusterMovies$height), n = 20)
plot(atemp)

# Suppose we use 10. Assign each movie to one of 10 clusters.
clusterGroups <- cutree(tree = clusterMovies, k = 10)

# Now describe the clusters. The genre columns are 0/1, so a per-cluster mean
# is the fraction of that cluster's movies carrying the genre.

tapply(X = movies$Action, INDEX = clusterGroups, FUN = mean)
##         1         2         3         4         5         6         7 
## 0.1784512 0.7839196 0.1238532 0.0000000 0.0000000 0.1015625 0.0000000 
##         8         9        10 
## 0.0000000 0.0000000 0.0000000
round(
  tapply(X = movies$Action, INDEX = clusterGroups, FUN = mean),
  digits = 3
)
##     1     2     3     4     5     6     7     8     9    10 
## 0.178 0.784 0.124 0.000 0.000 0.102 0.000 0.000 0.000 0.000
round(
  tapply(X = movies$Romance, INDEX = clusterGroups, FUN = mean),
  digits = 3
)
##     1     2     3     4     5     6     7     8     9    10 
## 0.104 0.045 0.037 0.000 0.000 1.000 1.000 0.000 0.000 0.000
# Means of all genre columns at once, one result per cluster.
by(data = movies[, 3:21], INDICES = clusterGroups, FUN = colMeans)
## clusterGroups: 1
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
## 0.006734007 0.178451178 0.185185185 0.134680135 0.393939394 0.363636364 
##       Crime Documentary       Drama     Fantasy    FilmNoir      Horror 
## 0.033670034 0.010101010 0.306397306 0.070707071 0.000000000 0.016835017 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
## 0.188552189 0.000000000 0.104377104 0.074074074 0.040404040 0.225589226 
##     Western 
## 0.090909091 
## -------------------------------------------------------- 
## clusterGroups: 2
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
## 0.000000000 0.783919598 0.351758794 0.010050251 0.005025126 0.065326633 
##       Crime Documentary       Drama     Fantasy    FilmNoir      Horror 
## 0.005025126 0.000000000 0.110552764 0.000000000 0.000000000 0.080402010 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
## 0.000000000 0.000000000 0.045226131 0.346733668 0.376884422 0.015075377 
##     Western 
## 0.000000000 
## -------------------------------------------------------- 
## clusterGroups: 3
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
## 0.000000000 0.123853211 0.036697248 0.000000000 0.009174312 0.064220183 
##       Crime Documentary       Drama     Fantasy    FilmNoir      Horror 
## 0.412844037 0.000000000 0.380733945 0.004587156 0.105504587 0.018348624 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
## 0.000000000 0.275229358 0.036697248 0.041284404 0.610091743 0.000000000 
##     Western 
## 0.000000000 
## -------------------------------------------------------- 
## clusterGroups: 4
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
##           0           0           0           0           0           0 
##       Crime Documentary       Drama     Fantasy    FilmNoir      Horror 
##           0           0           1           0           0           0 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
##           0           0           0           0           0           0 
##     Western 
##           0 
## -------------------------------------------------------- 
## clusterGroups: 5
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
##           0           0           0           0           0           1 
##       Crime Documentary       Drama     Fantasy    FilmNoir      Horror 
##           0           0           0           0           0           0 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
##           0           0           0           0           0           0 
##     Western 
##           0 
## -------------------------------------------------------- 
## clusterGroups: 6
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
##   0.0000000   0.1015625   0.0000000   0.0000000   0.0000000   0.1093750 
##       Crime Documentary       Drama     Fantasy    FilmNoir      Horror 
##   0.0468750   0.0000000   0.6640625   0.0000000   0.0078125   0.0156250 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
##   0.0000000   0.0000000   1.0000000   0.0000000   0.1406250   0.0000000 
##     Western 
##   0.0000000 
## -------------------------------------------------------- 
## clusterGroups: 7
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
##           0           0           0           0           0           1 
##       Crime Documentary       Drama     Fantasy    FilmNoir      Horror 
##           0           0           0           0           0           0 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
##           0           0           1           0           0           0 
##     Western 
##           0 
## -------------------------------------------------------- 
## clusterGroups: 8
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
##   0.0000000   0.0000000   0.0000000   0.0000000   0.0000000   0.0212766 
##       Crime Documentary       Drama     Fantasy    FilmNoir      Horror 
##   0.0000000   1.0000000   0.0000000   0.0000000   0.0000000   0.0000000 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
##   0.0000000   0.0000000   0.0000000   0.0000000   0.0000000   0.0212766 
##     Western 
##   0.0000000 
## -------------------------------------------------------- 
## clusterGroups: 9
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
##           0           0           0           0           0           1 
##       Crime Documentary       Drama     Fantasy    FilmNoir      Horror 
##           0           0           1           0           0           0 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
##           0           0           0           0           0           0 
##     Western 
##           0 
## -------------------------------------------------------- 
## clusterGroups: 10
##     Unknown      Action   Adventure   Animation   Childrens      Comedy 
##   0.0000000   0.0000000   0.0000000   0.0000000   0.0000000   0.1587302 
##       Crime Documentary       Drama     Fantasy    FilmNoir      Horror 
##   0.0000000   0.0000000   0.0000000   0.0000000   0.0000000   1.0000000 
##     Musical     Mystery     Romance       SciFi    Thriller         War 
##   0.0000000   0.0000000   0.0000000   0.0000000   0.1587302   0.0000000 
##     Western 
##   0.0000000
# Find which cluster Men in Black is in.

subset(movies, Title == "Men in Black (1997)")
##       X               Title Unknown Action Adventure Animation Childrens
## 257 257 Men in Black (1997)       0      1         1         0         0
##     Comedy Crime Documentary Drama Fantasy FilmNoir Horror Musical Mystery
## 257      1     0           0     0       0        0      0       0       0
##     Romance SciFi Thriller War Western
## 257       0     1        0   0       0
# clusterGroups has one entry per row of movies, in row order, so it must be
# indexed by row position. 257 is the row label printed on the left above.
clusterGroups[257]
## [1] 2
# Same lookup without the hard-coded index. The X id column is NOT a row
# position (it runs up to 1682 while movies has only 1664 rows), so indexing
# clusterGroups with X directly can return another movie's cluster or NA.
# match() translates the id into the row position first.
findMenInBlack <- subset(movies, Title == "Men in Black (1997)")
clusterGroups[match(findMenInBlack$X, movies$X)]
## [1] 2
# Keep only the rows of movies that were assigned to cluster 2.
cluster2 <- movies[which(clusterGroups == 2), , drop = FALSE]

# First ten titles of that cluster:
cluster2$Title[seq_len(10)]
##  [1] GoldenEye (1995)                              
##  [2] Bad Boys (1995)                               
##  [3] Apollo 13 (1995)                              
##  [4] Net, The (1995)                               
##  [5] Natural Born Killers (1994)                   
##  [6] Outbreak (1995)                               
##  [7] Stargate (1994)                               
##  [8] Fugitive, The (1993)                          
##  [9] Jurassic Park (1993)                          
## [10] Robert A. Heinlein's The Puppet Masters (1994)
## 1664 Levels: 'Til There Was You (1997) ...
# K-means

# Fix the RNG seed so the randomly chosen starting centres are repeatable.
set.seed(seed = 123)
movie_km <- kmeans(x = movies[, 3:21], centers = 10)
ls(movie_km)
## [1] "betweenss"    "centers"      "cluster"      "ifault"      
## [5] "iter"         "size"         "tot.withinss" "totss"       
## [9] "withinss"
# Number of movies in each cluster, as stored on the fitted object.
movie_km$size
##  [1]  88  60 480 522 162  68 143  52  64  25
# The same counts, tabulated from the per-movie cluster assignments.
table(movie_km$cluster)
## 
##   1   2   3   4   5   6   7   8   9  10 
##  88  60 480 522 162  68 143  52  64  25
# Share of Action movies in each k-means cluster, rounded to 3 decimals.
round(
  tapply(X = movies$Action, INDEX = movie_km$cluster, FUN = mean),
  digits = 3
)
##     1     2     3     4     5     6     7     8     9    10 
## 0.045 0.067 0.023 0.038 0.315 0.015 0.986 0.192 0.000 0.280