# Clustering
# Load the movie/genre table; the relative path is resolved against the
# current working directory.
movies <- read.csv("movies.csv")
# Inspect the structure: column names, types and the first few values.
str(object = movies)
## 'data.frame': 1664 obs. of 21 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : Factor w/ 1664 levels "'Til There Was You (1997)",..: 1525 618 555 594 344 1318 1545 111 391 1240 ...
## $ Unknown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Action : int 0 1 0 1 0 0 0 0 0 0 ...
## $ Adventure : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Animation : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Childrens : int 1 0 0 0 0 0 0 1 0 0 ...
## $ Comedy : int 1 0 0 1 0 0 0 1 0 0 ...
## $ Crime : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Documentary: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Drama : int 0 0 0 1 1 1 1 1 1 1 ...
## $ Fantasy : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FilmNoir : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Horror : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Musical : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Mystery : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Romance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SciFi : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Thriller : int 0 1 1 0 1 0 0 0 0 0 ...
## $ War : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Western : int 0 0 0 0 0 0 0 0 0 0 ...
# Per-column summary: quartiles/mean for numeric columns, counts for Title.
summary(object = movies)
## X Title
## Min. : 1.0 'Til There Was You (1997) : 1
## 1st Qu.: 419.8 Á köldum klaka (Cold Fever) (1994): 1
## Median : 838.5 1-900 (1994) : 1
## Mean : 838.7 101 Dalmatians (1996) : 1
## 3rd Qu.:1258.2 12 Angry Men (1957) : 1
## Max. :1682.0 187 (1997) : 1
## (Other) :1658
## Unknown Action Adventure Animation
## Min. :0.000000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.000000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.001202 Mean :0.1496 Mean :0.07993 Mean :0.02524
## 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.000000 Max. :1.0000 Max. :1.00000 Max. :1.00000
##
## Childrens Comedy Crime Documentary
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.07212 Mean :0.3017 Mean :0.0643 Mean :0.03005
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## Drama Fantasy FilmNoir Horror
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.4303 Mean :0.01322 Mean :0.01442 Mean :0.05409
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
##
## Musical Mystery Romance SciFi
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.03365 Mean :0.03606 Mean :0.1466 Mean :0.0601
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## Thriller War Western
## Min. :0.000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.000 Median :0.00000 Median :0.00000
## Mean :0.149 Mean :0.04267 Mean :0.01623
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.00000 Max. :1.00000
##
# Show the first six rows (head()'s default), spelled out explicitly.
head(movies, n = 6L)
## X Title Unknown Action
## 1 1 Toy Story (1995) 0 0
## 2 2 GoldenEye (1995) 0 1
## 3 3 Four Rooms (1995) 0 0
## 4 4 Get Shorty (1995) 0 1
## 5 5 Copycat (1995) 0 0
## 6 6 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) 0 0
## Adventure Animation Childrens Comedy Crime Documentary Drama Fantasy
## 1 0 1 1 1 0 0 0 0
## 2 1 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 1 0 0 1 0
## 5 0 0 0 0 1 0 1 0
## 6 0 0 0 0 0 0 1 0
## FilmNoir Horror Musical Mystery Romance SciFi Thriller War Western
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 1 0 0
## 3 0 0 0 0 0 0 1 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 1 0 0
## 6 0 0 0 0 0 0 0 0 0
# Counts of non-comedy (0) vs. comedy (1) movies.
comedyCounts <- table(movies[["Comedy"]])
comedyCounts
##
## 0 1
## 1162 502
# The same counts expressed as proportions of all movies.
prop.table(comedyCounts)
##
## 0 1
## 0.6983173 0.3016827
# Cross-tabulate Romance (rows) against Drama (columns); no dimension labels.
table(movies[["Romance"]], movies[["Drama"]])
##
## 0 1
## 0 801 619
## 1 147 97
# Same cross-tabulation, this time with labelled dimensions.
romanceByDrama <- table(
  Romance = movies[["Romance"]],
  Drama = movies[["Drama"]]
)
romanceByDrama
## Drama
## Romance 0 1
## 0 801 619
## 1 147 97
# Share of all movies falling in each Romance/Drama cell.
prop.table(romanceByDrama)
## Drama
## Romance 0 1
## 0 0.48137019 0.37199519
## 1 0.08834135 0.05829327
# Compute pairwise Euclidean distances between movies using only the 19
# genre indicator columns (columns 3-21, Unknown ... Western); the X and
# Title columns are left out.
distances = dist(movies[3:21], method = "euclidean")
# Hierarchical clustering on those distances.
# NOTE(review): per ?hclust, "ward.D" only implements Ward's criterion when
# it is given squared distances; "ward.D2" does the squaring itself.
# Switching methods would change the results recorded in this file, so
# confirm the intent before changing it.
clusterMovies = hclust(distances, method = "ward.D")
# Plot the dendrogram
plot(clusterMovies)

# Quick question: cut the tree into 2 clusters and compare the mean of the
# Drama flag (i.e. the share of dramas) in each.
clusterGroups2 <- cutree(tree = clusterMovies, k = 2)
tapply(X = movies[["Drama"]], INDEX = clusterGroups2, FUN = mean)
## 1 2
## 0.2673879 1.0000000
# Use the merge heights of the tree to help choose the number of clusters.
mergeHeights <- clusterMovies[["height"]]
summary(mergeHeights)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.541 0.000 383.700
quantile(mergeHeights, probs = c(0.25, 0.5, 0.75, 0.9, 0.99))
## 25% 50% 75% 90% 99%
## 0.000000 0.000000 0.000000 1.400406 29.675344
# Largest merge heights: sort() is ascending, so tail() yields the biggest
# values, smallest of them first.
sortedHeights <- sort(mergeHeights)
tail(sortedHeights)
## [1] 79.87443 84.12643 99.16058 149.08957 235.16817 383.73607
# Plot the 20 largest heights to look for a jump.
atemp <- tail(sortedHeights, n = 20)
plot(atemp)

# Suppose we use 10. Assign each movie to one of 10 clusters.
clusterGroups <- cutree(tree = clusterMovies, k = 10)
# Now characterise the clusters. The mean of a 0/1 genre flag within a
# cluster is the proportion of that cluster's movies carrying the genre.
actionShare <- tapply(
  X = movies[["Action"]],
  INDEX = clusterGroups,
  FUN = mean
)
actionShare
## 1 2 3 4 5 6 7
## 0.1784512 0.7839196 0.1238532 0.0000000 0.0000000 0.1015625 0.0000000
## 8 9 10
## 0.0000000 0.0000000 0.0000000
# Same figures rounded to three decimals for readability.
round(actionShare, digits = 3)
## 1 2 3 4 5 6 7 8 9 10
## 0.178 0.784 0.124 0.000 0.000 0.102 0.000 0.000 0.000 0.000
round(
  tapply(X = movies[["Romance"]], INDEX = clusterGroups, FUN = mean),
  digits = 3
)
## 1 2 3 4 5 6 7 8 9 10
## 0.104 0.045 0.037 0.000 0.000 1.000 1.000 0.000 0.000 0.000
# Per-cluster mean of every genre column (columns 3-21) in a single call.
by(data = movies[, 3:21], INDICES = clusterGroups, FUN = colMeans)
## clusterGroups: 1
## Unknown Action Adventure Animation Childrens Comedy
## 0.006734007 0.178451178 0.185185185 0.134680135 0.393939394 0.363636364
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0.033670034 0.010101010 0.306397306 0.070707071 0.000000000 0.016835017
## Musical Mystery Romance SciFi Thriller War
## 0.188552189 0.000000000 0.104377104 0.074074074 0.040404040 0.225589226
## Western
## 0.090909091
## --------------------------------------------------------
## clusterGroups: 2
## Unknown Action Adventure Animation Childrens Comedy
## 0.000000000 0.783919598 0.351758794 0.010050251 0.005025126 0.065326633
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0.005025126 0.000000000 0.110552764 0.000000000 0.000000000 0.080402010
## Musical Mystery Romance SciFi Thriller War
## 0.000000000 0.000000000 0.045226131 0.346733668 0.376884422 0.015075377
## Western
## 0.000000000
## --------------------------------------------------------
## clusterGroups: 3
## Unknown Action Adventure Animation Childrens Comedy
## 0.000000000 0.123853211 0.036697248 0.000000000 0.009174312 0.064220183
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0.412844037 0.000000000 0.380733945 0.004587156 0.105504587 0.018348624
## Musical Mystery Romance SciFi Thriller War
## 0.000000000 0.275229358 0.036697248 0.041284404 0.610091743 0.000000000
## Western
## 0.000000000
## --------------------------------------------------------
## clusterGroups: 4
## Unknown Action Adventure Animation Childrens Comedy
## 0 0 0 0 0 0
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0 0 1 0 0 0
## Musical Mystery Romance SciFi Thriller War
## 0 0 0 0 0 0
## Western
## 0
## --------------------------------------------------------
## clusterGroups: 5
## Unknown Action Adventure Animation Childrens Comedy
## 0 0 0 0 0 1
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0 0 0 0 0 0
## Musical Mystery Romance SciFi Thriller War
## 0 0 0 0 0 0
## Western
## 0
## --------------------------------------------------------
## clusterGroups: 6
## Unknown Action Adventure Animation Childrens Comedy
## 0.0000000 0.1015625 0.0000000 0.0000000 0.0000000 0.1093750
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0.0468750 0.0000000 0.6640625 0.0000000 0.0078125 0.0156250
## Musical Mystery Romance SciFi Thriller War
## 0.0000000 0.0000000 1.0000000 0.0000000 0.1406250 0.0000000
## Western
## 0.0000000
## --------------------------------------------------------
## clusterGroups: 7
## Unknown Action Adventure Animation Childrens Comedy
## 0 0 0 0 0 1
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0 0 0 0 0 0
## Musical Mystery Romance SciFi Thriller War
## 0 0 1 0 0 0
## Western
## 0
## --------------------------------------------------------
## clusterGroups: 8
## Unknown Action Adventure Animation Childrens Comedy
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0212766
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## Musical Mystery Romance SciFi Thriller War
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0212766
## Western
## 0.0000000
## --------------------------------------------------------
## clusterGroups: 9
## Unknown Action Adventure Animation Childrens Comedy
## 0 0 0 0 0 1
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0 0 1 0 0 0
## Musical Mystery Romance SciFi Thriller War
## 0 0 0 0 0 0
## Western
## 0
## --------------------------------------------------------
## clusterGroups: 10
## Unknown Action Adventure Animation Childrens Comedy
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.1587302
## Crime Documentary Drama Fantasy FilmNoir Horror
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 1.0000000
## Musical Mystery Romance SciFi Thriller War
## 0.0000000 0.0000000 0.0000000 0.0000000 0.1587302 0.0000000
## Western
## 0.0000000
# Find which cluster Men in Black is in.
# Pull the matching row and print it to see its genre flags.
findMenInBlack <- subset(movies, Title == "Men in Black (1997)")
findMenInBlack
## X Title Unknown Action Adventure Animation Childrens
## 257 257 Men in Black (1997) 0 1 1 0 0
## Comedy Crime Documentary Drama Fantasy FilmNoir Horror Musical Mystery
## 257 1 0 0 0 0 0 0 0 0
## Romance SciFi Thriller War Western
## 257 0 1 0 0 0
# Row position 257 is read off the printed row name above.
clusterGroups[257]
## [1] 2
# Same lookup without the hard-coded number. clusterGroups has one entry per
# row of `movies`, in row order, so it must be indexed by row position.
# The X column is an id, not a position: it runs up to 1682 while the table
# has only 1664 rows, so indexing by X can return the wrong movie's cluster
# (or NA). which() gives the position(s) of the matching title instead.
clusterGroups[which(movies$Title == "Men in Black (1997)")]
## [1] 2
# Keep only the movies that were assigned to cluster 2.
cluster2 <- subset(movies, subset = clusterGroups == 2)
# First 10 titles in this cluster.
cluster2[["Title"]][seq_len(10)]
## [1] GoldenEye (1995)
## [2] Bad Boys (1995)
## [3] Apollo 13 (1995)
## [4] Net, The (1995)
## [5] Natural Born Killers (1994)
## [6] Outbreak (1995)
## [7] Stargate (1994)
## [8] Fugitive, The (1993)
## [9] Jurassic Park (1993)
## [10] Robert A. Heinlein's The Puppet Masters (1994)
## 1664 Levels: 'Til There Was You (1997) ...
## K-means
# Fix the RNG seed so the fit below is reproducible.
set.seed(seed = 123)
# Partition the movies into 10 clusters on the same 19 genre columns.
# NOTE(review): only the default single random start is used; consider
# `nstart` if a more stable solution is wanted (this would change results).
movie_km <- kmeans(x = movies[3:21], centers = 10)
# List the components of the fitted object.
ls(movie_km)
## [1] "betweenss" "centers" "cluster" "ifault"
## [5] "iter" "size" "tot.withinss" "totss"
## [9] "withinss"
# Number of movies in each k-means cluster, as stored on the fit ...
movie_km[["size"]]
## [1] 88 60 480 522 162 68 143 52 64 25
# ... and recomputed from the per-movie cluster assignments.
table(movie_km[["cluster"]])
##
## 1 2 3 4 5 6 7 8 9 10
## 88 60 480 522 162 68 143 52 64 25
# Proportion of Action movies within each k-means cluster.
round(
  tapply(X = movies[["Action"]], INDEX = movie_km[["cluster"]], FUN = mean),
  digits = 3
)
## 1 2 3 4 5 6 7 8 9 10
## 0.045 0.067 0.023 0.038 0.315 0.015 0.986 0.192 0.000 0.280