# Video 6
# Point R at the folder that holds the course data.
# NOTE(review): this is a hard-coded, machine-specific path; anyone else
# running the script has to edit it (or start R in the data folder) first.
setwd("C:/Users/daria.alekseeva/Documents/Edx/Clustering/")
# Read the pipe-delimited MovieLens file. It has no header row, and only the
# double quote is treated as a quoting character, so apostrophes inside
# titles are read as ordinary text.
# NOTE(review): the recorded str() output below shows Factor columns, i.e. it
# was produced where strings were converted to factors by default (R < 4.0).
movies <- read.table(
  file = "movieLens.txt",
  header = FALSE,
  sep = "|",
  quote = "\""
)
# Inspect the structure of the raw table
str(movies)
## 'data.frame': 1682 obs. of 24 variables:
## $ V1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ V2 : Factor w/ 1664 levels "'Til There Was You (1997)",..: 1525 618 555 594 344 1318 1545 111 391 1240 ...
## $ V3 : Factor w/ 241 levels "","01-Aug-1997",..: 71 71 71 71 71 71 71 71 71 182 ...
## $ V4 : logi NA NA NA NA NA NA ...
## $ V5 : Factor w/ 1661 levels "","http://us.imdb.com/M/title-exact/Independence%20(1997)",..: 1431 565 505 543 310 1661 1453 103 357 1183 ...
## $ V6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ V7 : int 0 1 0 1 0 0 0 0 0 0 ...
## $ V8 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ V9 : int 1 0 0 0 0 0 0 0 0 0 ...
## $ V10: int 1 0 0 0 0 0 0 1 0 0 ...
## $ V11: int 1 0 0 1 0 0 0 1 0 0 ...
## $ V12: int 0 0 0 0 1 0 0 0 0 0 ...
## $ V13: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V14: int 0 0 0 1 1 1 1 1 1 1 ...
## $ V15: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V16: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V17: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V18: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V19: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V20: int 0 0 0 0 0 0 0 0 0 0 ...
## $ V21: int 0 0 0 0 0 0 1 0 0 0 ...
## $ V22: int 0 1 1 0 1 0 0 0 0 0 ...
## $ V23: int 0 0 0 0 0 0 0 0 0 1 ...
## $ V24: int 0 0 0 0 0 0 0 0 0 0 ...
# Give the 24 unnamed columns (V1..V24) descriptive names: four metadata
# fields plus the title, followed by one 0/1 indicator column per genre.
names(movies) <- c(
  "ID", "Title", "ReleaseDate", "VideoReleaseDate", "IMDB",
  "Unknown", "Action", "Adventure", "Animation", "Childrens",
  "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
  "FilmNoir", "Horror", "Musical", "Mystery", "Romance",
  "SciFi", "Thriller", "War", "Western"
)
# Confirm the new names were applied
str(movies)
## 'data.frame': 1682 obs. of 24 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : Factor w/ 1664 levels "'Til There Was You (1997)",..: 1525 618 555 594 344 1318 1545 111 391 1240 ...
## $ ReleaseDate : Factor w/ 241 levels "","01-Aug-1997",..: 71 71 71 71 71 71 71 71 71 182 ...
## $ VideoReleaseDate: logi NA NA NA NA NA NA ...
## $ IMDB : Factor w/ 1661 levels "","http://us.imdb.com/M/title-exact/Independence%20(1997)",..: 1431 565 505 543 310 1661 1453 103 357 1183 ...
## $ Unknown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Action : int 0 1 0 1 0 0 0 0 0 0 ...
## $ Adventure : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Animation : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Childrens : int 1 0 0 0 0 0 0 1 0 0 ...
## $ Comedy : int 1 0 0 1 0 0 0 1 0 0 ...
## $ Crime : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Documentary : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Drama : int 0 0 0 1 1 1 1 1 1 1 ...
## $ Fantasy : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FilmNoir : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Horror : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Musical : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Mystery : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Romance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SciFi : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Thriller : int 0 1 1 0 1 0 0 0 0 0 ...
## $ War : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Western : int 0 0 0 0 0 0 0 0 0 0 ...
# Drop the columns that are not used for clustering, keeping the title and
# the genre indicators in their original order.
movies <- movies[setdiff(
  names(movies),
  c("ID", "ReleaseDate", "VideoReleaseDate", "IMDB")
)]
# Keep only the first occurrence of each fully identical row
movies <- movies[!duplicated(movies), , drop = FALSE]
# Check the cleaned data: one title column plus the genre flags
str(movies)
## 'data.frame': 1664 obs. of 20 variables:
## $ Title : Factor w/ 1664 levels "'Til There Was You (1997)",..: 1525 618 555 594 344 1318 1545 111 391 1240 ...
## $ Unknown : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Action : int 0 1 0 1 0 0 0 0 0 0 ...
## $ Adventure : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Animation : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Childrens : int 1 0 0 0 0 0 0 1 0 0 ...
## $ Comedy : int 1 0 0 1 0 0 0 1 0 0 ...
## $ Crime : int 0 0 0 0 1 0 0 0 0 0 ...
## $ Documentary: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Drama : int 0 0 0 1 1 1 1 1 1 1 ...
## $ Fantasy : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FilmNoir : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Horror : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Musical : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Mystery : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Romance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SciFi : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Thriller : int 0 1 1 0 1 0 0 0 0 0 ...
## $ War : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Western : int 0 0 0 0 0 0 0 0 0 0 ...
# Video 7
# Pairwise Euclidean distances between movies, computed on the genre
# indicator columns only (columns 2 to 20; column 1 is the title).
distances <- dist(movies[, 2:20], method = "euclidean")
# Agglomerative hierarchical clustering on those distances.
# NOTE(review): per the hclust documentation, "ward.D" applied to unsquared
# distances does not implement Ward's criterion ("ward.D2" does). Switching
# would change the clusters and the recorded output below, so it is left as is.
clusterMovies <- hclust(d = distances, method = "ward.D")
# Draw the dendrogram of the fitted tree
plot(clusterMovies)

# Cut the tree into 10 groups, giving every movie a cluster label
clusterGroups <- cutree(tree = clusterMovies, k = 10)
# Now let's figure out what the clusters are like.
# Each genre column is a 0/1 flag, so its mean within a cluster is the
# proportion of that cluster's movies that belong to the genre.
tapply(X = movies$Action, INDEX = clusterGroups, FUN = mean)
## 1 2 3 4 5 6 7
## 0.1784512 0.7839196 0.1238532 0.0000000 0.0000000 0.1015625 0.0000000
## 8 9 10
## 0.0000000 0.0000000 0.0000000
tapply(X = movies$Romance, INDEX = clusterGroups, FUN = mean)
## 1 2 3 4 5 6
## 0.10437710 0.04522613 0.03669725 0.00000000 0.00000000 1.00000000
## 7 8 9 10
## 1.00000000 0.00000000 0.00000000 0.00000000
# Repeating this for every genre gives the results in ClusterMeans.ods
# Find which cluster Men in Black is in.
# First show the movie's row (its row name is the original row number).
subset(movies, Title == "Men in Black (1997)")
## Title Unknown Action Adventure Animation Childrens
## 257 Men in Black (1997) 0 1 1 0 0
## Comedy Crime Documentary Drama Fantasy FilmNoir Horror Musical Mystery
## 257 1 0 0 0 0 0 0 0 0
## Romance SciFi Thriller War Western
## 257 0 1 0 0 0
# Look the cluster up by title rather than by the literal position 257.
# clusterGroups is aligned with the rows of movies by position, and a row's
# position only equals its row name while no duplicate rows were dropped
# before it; matching on the title stays correct either way. which() drops
# any NA comparison results so only real matches are returned.
clusterGroups[which(movies$Title == "Men in Black (1997)")]
## 257
## 2
# Keep only the movies that were assigned to cluster 2
cluster2 <- movies[clusterGroups == 2, , drop = FALSE]
# Show the first 10 titles in this cluster
cluster2[["Title"]][seq_len(10)]
## [1] GoldenEye (1995)
## [2] Bad Boys (1995)
## [3] Apollo 13 (1995)
## [4] Net, The (1995)
## [5] Natural Born Killers (1994)
## [6] Outbreak (1995)
## [7] Stargate (1994)
## [8] Fugitive, The (1993)
## [9] Jurassic Park (1993)
## [10] Robert A. Heinlein's The Puppet Masters (1994)
## 1664 Levels: 'Til There Was You (1997) ... Zeus and Roxanne (1997)