Assignment_week04_June Yao_500316995
1 Movie ratings data
1.1 Data processing[optional]
library(dplyr)
library(tidyr)
ratings <- read.csv("ml-latest-small/ratings.csv", header = TRUE)
movies <- read.csv("ml-latest-small/movies.csv", header = TRUE)
movielens <- left_join(movies, ratings)
top <- movielens %>%
group_by(movieId) %>%
summarize(n=n(), title = first(title)) %>%
top_n(40, n) %>%
pull(movieId)
x <- movielens %>%
filter(movieId %in% top) %>%
group_by(userId) %>%
filter(n() >= 20) %>%
ungroup() %>%
select(title, userId, rating) %>%
spread(userId, rating)
x <- as.data.frame(x)
rownames(x) <- x$title
x$title <- NULL
colnames(x) <- paste0("user_", colnames(x))
write.table(x, row.names = TRUE, col.names = TRUE, sep = ",", file = "movielens_top40.csv")1.2 Data input and IDA
movielens <- read.csv("movielens_top40.csv", header = TRUE)
dim(movielens)
## [1] 40 153
knitr::kable(movielens[1:5,1:5])| user_1 | user_6 | user_7 | user_15 | user_17 | |
|---|---|---|---|---|---|
| Aladdin (1992) | NA | 5 | 3.0 | 3 | NA |
| American Beauty (1999) | 5 | NA | 4.0 | 4 | 4.0 |
| Apollo 13 (1995) | NA | 4 | 4.5 | NA | 3.5 |
| Back to the Future (1985) | 5 | NA | 5.0 | 5 | 4.5 |
| Batman (1989) | 4 | 3 | 3.0 | NA | 4.5 |
1.3 Hierarchical clustering
1.Basic hclust usage
#par(mfrow = c(3, 1))
movielens.dist = dist(movielens)
plot(hclust(movielens.dist),
main = "Complete Linkage", xlab = "", sub = "", ylab = "", cex = 0.8)plot(hclust(movielens.dist, method = "average"),
main = "Average Linkage", xlab = "", sub = "", ylab = "", cex = 0.8)plot(hclust(movielens.dist, method = "single"),
main = "Single Linkage", xlab = "", sub = "", ylab = "", cex = 0.8)2.Form clusters in hclust
Below we separate the movie titles into four clusters:
hc.out = hclust(dist(movielens.dist))
hc.clusters = cutree(hc.out, 4)
knitr::kable(table(hc.clusters),col.names = c("Cluster","Number Of Movies"))| Cluster | Number Of Movies |
|---|---|
| 1 | 12 |
| 2 | 7 |
| 3 | 16 |
| 4 | 5 |
Here we can extract the movies in cluster 1:
## [1] "Aladdin (1992)" "Apollo 13 (1995)"
## [3] "Back to the Future (1985)" "Batman (1989)"
## [5] "Dances with Wolves (1990)" "Fugitive, The (1993)"
## [7] "Gladiator (2000)" "Jurassic Park (1993)"
## [9] "Lion King, The (1994)" "Shrek (2001)"
## [11] "Sixth Sense, The (1999)" "Toy Story (1995)"
Since the fourth largest height is 26.48, we use this height to cut the tree into four clusters.
plot(hc.out,
xlab = "", sub = "", ylab = "", cex = 0.8)
abline(h=h4,col="red")
text(40,h4+2, paste0("h=",round(h4,2)), col = "red")3.Data Transformation
movie.trans<-movielens
movie.trans[is.na(movie.trans)]<-0
movie.trans[movie.trans>0]<-1
knitr::kable(movie.trans[1:5,1:5])| user_1 | user_6 | user_7 | user_15 | user_17 | |
|---|---|---|---|---|---|
| Aladdin (1992) | 0 | 1 | 1 | 1 | 0 |
| American Beauty (1999) | 1 | 0 | 1 | 1 | 1 |
| Apollo 13 (1995) | 0 | 1 | 1 | 0 | 1 |
| Back to the Future (1985) | 1 | 0 | 1 | 1 | 1 |
| Batman (1989) | 1 | 1 | 1 | 0 | 1 |
trans.dist = dist(movie.trans,method="manhattan")
trans.hc.out = hclust(trans.dist)
trans.hc.clusters = cutree(trans.hc.out, 4)
table(trans.hc.clusters)
## trans.hc.clusters
## 1 2 3 4
## 4 10 20 6
trans.hc.out$labels[trans.hc.clusters == 1]
## [1] "Aladdin (1992)" "Apollo 13 (1995)"
## [3] "Dances with Wolves (1990)" "Lion King, The (1994)"
height4=sort(trans.hc.out$height,decreasing = TRUE)[4]
plot(trans.hc.out,
main = "Manhattan Distance With 0/1 Rating", xlab = "", sub = "", ylab = "", cex = 0.8)
abline(h=height4,col="red")
text(40,height4+2, paste0("h=",round(height4,2)), col = "red")1.4 Visulize the data[Optional]
1.5 Comparing trees[Optional]
library(dendextend)
d <- dist(movielens)
# Create two dendrograms
h_avg <- hclust(d, method = "average")
h_single <- hclust(d, method = "single")
dend1 <- as.dendrogram(h_avg)
dend2 <- as.dendrogram(h_single)
# Create a list to hold dendrograms
dend_list <- dendlist(dend1, dend2)
# Compare the two trees
tanglegram(dend_list)1.6 k-means
1.Basic k-means usage
| user_1 | user_6 | user_7 | user_15 | user_17 | |
|---|---|---|---|---|---|
| Aladdin (1992) | 0 | 5 | 3.0 | 3 | 0.0 |
| American Beauty (1999) | 5 | 0 | 4.0 | 4 | 4.0 |
| Apollo 13 (1995) | 0 | 4 | 4.5 | 0 | 3.5 |
| Back to the Future (1985) | 5 | 0 | 5.0 | 5 | 4.5 |
| Batman (1989) | 4 | 3 | 3.0 | 0 | 4.5 |
movie.km <- kmeans(movie.clean, centers = 4)
knitr::kable(table(movie.km$cluster),col.names = c("Cluster","Number Of Movies"))| Cluster | Number Of Movies |
|---|---|
| 1 | 10 |
| 2 | 6 |
| 3 | 9 |
| 4 | 15 |
2.visualize results
Method1
library(ggfortify)
## Loading required package: ggplot2
movie.pr <- prcomp(movie.clean, scale = TRUE)
#plot(movie.pr,type="line")
autoplot(movie.km ,data = movie.clean)+ggtitle("Top40 Movies K-means Visualization")
## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.1.7 Cluster statistics
Method1
nk=2:6
WSS<-sapply(nk,function(k){
kmeans(movie.clean,centers = k)$tot.withinss
})
BSS=sapply(nk,function(k){
kmeans(movie.clean,centers = k)$betweenss
})
TSS <- BSS + WSS
ratio <- WSS / TSS
par(mfrow = c(3, 1))
plot(nk,WSS,xlab="number of clusters",ylab = "WSS", main = "Within group SS")
plot(nk,BSS,xlab="number of clusters",ylab = "BSS", main = "Between group SS")
plot(nk,ratio,xlab="number of clusters",ylab = "ratio=WSS/TSS", main = "Between group WSS/TSS")According to ratio=WSS/TSS, the best k is 6.
Method2 (teacher’s resolution)
set.seed(5003)
center.seq <- 2:6
kmeans <- lapply(center.seq, function(x) kmeans(movie.clean, centers = x))
tot.within.ss <- sapply(kmeans, "[[", "tot.withinss")
between.ss <- sapply(kmeans, "[[", "betweenss")
par(mfrow = c(3, 1))
plot(center.seq, tot.within.ss, xlab = "Number of clusters", main = "Within group SS")
plot(center.seq, between.ss, xlab = "Number of clusters", main = "Between group SS")
plot(center.seq, tot.within.ss/(between.ss+tot.within.ss), xlab = "Number of clusters", main = "Between group WSS/TSS") According to ratio=WSS/TSS, the best k is 6.
3 Shiny app to allow the user to explore and decide
click to view the shiny app:
APP. Student Info
Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 4
Student Name: Yujun Yao(June Yao)
SID: 500316995
Email: yyao2983@uni.sydney.edu.au