1 Movie ratings data

1.1 Data processing [optional]


library(dplyr)
library(tidyr)
# Merge the ratings with the movie titles (one row per user-movie rating)
ratings <- read.csv("ml-latest-small/ratings.csv", header = TRUE)
movies <- read.csv("ml-latest-small/movies.csv", header = TRUE)
movielens <- left_join(movies, ratings, by = "movieId")

# Keep the 40 most frequently rated movies
top <- movielens %>%
  group_by(movieId) %>%
  summarize(n = n(), title = first(title)) %>%
  top_n(40, n) %>%
  pull(movieId)

# Keep users with at least 20 ratings among these movies,
# then reshape to a movies-by-users rating matrix
x <- movielens %>% 
  filter(movieId %in% top) %>%
  group_by(userId) %>%
  filter(n() >= 20) %>%
  ungroup() %>% 
  select(title, userId, rating) %>%
  spread(userId, rating)
x <- as.data.frame(x)
rownames(x) <- x$title
x$title <- NULL
colnames(x) <- paste0("user_", colnames(x))


write.table(x, row.names = TRUE, col.names = TRUE, sep = ",", file = "movielens_top40.csv")

1.2 Data input and IDA

movielens <- read.csv("movielens_top40.csv", header = TRUE)
dim(movielens) 
## [1]  40 153

knitr::kable(movielens[1:5,1:5])
                             user_1   user_6   user_7   user_15   user_17
Aladdin (1992)                   NA        5      3.0         3        NA
American Beauty (1999)            5       NA      4.0         4       4.0
Apollo 13 (1995)                 NA        4      4.5        NA       3.5
Back to the Future (1985)         5       NA      5.0         5       4.5
Batman (1989)                     4        3      3.0        NA       4.5
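
As part of the IDA, it is worth quantifying how sparse the ratings are. A minimal sketch using the movielens data frame loaded above (variable names are illustrative):

# Proportion of missing ratings per movie, and the overall sparsity
na.per.movie <- rowMeans(is.na(movielens))
summary(na.per.movie)
mean(is.na(movielens))   # overall fraction of missing user-movie ratings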

1.3 Hierarchical clustering

1. Basic hclust usage

#par(mfrow = c(3, 1))
# Euclidean distance between movies (rows); missing ratings are dropped pairwise by dist()
movielens.dist = dist(movielens)
plot(hclust(movielens.dist), 
     main = "Complete Linkage", xlab = "", sub = "", ylab = "", cex = 0.8)

plot(hclust(movielens.dist, method = "average"), 
     main = "Average Linkage", xlab = "", sub = "", ylab = "", cex = 0.8)

plot(hclust(movielens.dist, method = "single"),
     main = "Single Linkage", xlab = "", sub = "", ylab = "", cex = 0.8)

2. Form clusters in hclust

Below we separate the movie titles into four clusters:

# Complete linkage (hclust's default) on the movie distance matrix
hc.out = hclust(movielens.dist)
hc.clusters = cutree(hc.out, 4)
knitr::kable(table(hc.clusters),col.names = c("Cluster","Number Of Movies"))
Cluster   Number Of Movies
      1                 12
      2                  7
      3                 16
      4                  5

Here we can extract the movies in cluster 1:

hc.out$labels[hc.clusters == 1]
##  [1] "Aladdin (1992)"            "Apollo 13 (1995)"         
##  [3] "Back to the Future (1985)" "Batman (1989)"            
##  [5] "Dances with Wolves (1990)" "Fugitive, The (1993)"     
##  [7] "Gladiator (2000)"          "Jurassic Park (1993)"     
##  [9] "Lion King, The (1994)"     "Shrek (2001)"             
## [11] "Sixth Sense, The (1999)"   "Toy Story (1995)"
h4 = sort(hc.out$height, decreasing = TRUE)[4]   # fourth largest merge height

The fourth largest merge height is 26.48, so cutting the dendrogram at this height yields exactly four clusters.

plot(hc.out,
      xlab = "", sub = "", ylab = "", cex = 0.8)
abline(h=h4,col="red")
text(40,h4+2, paste0("h=",round(h4,2)), col = "red")
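
As a quick check, cutting the tree by height rather than by the number of clusters should reproduce the same grouping; a minimal sketch using the objects above:

table(cutree(hc.out, h = h4))   # expected to match the four cluster sizes reported above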

3. Data transformation


# Recode ratings as a 0/1 "rated / not rated" indicator:
# missing rating -> 0, any rating (0.5-5) -> 1
movie.trans <- movielens 
movie.trans[is.na(movie.trans)] <- 0 
movie.trans[movie.trans > 0] <- 1 
knitr::kable(movie.trans[1:5,1:5])
                             user_1   user_6   user_7   user_15   user_17
Aladdin (1992)                    0        1        1         1         0
American Beauty (1999)            1        0        1         1         1
Apollo 13 (1995)                  0        1        1         0         1
Back to the Future (1985)         1        0        1         1         1
Batman (1989)                     1        1        1         0         1

# On 0/1 data, Manhattan distance counts the users who rated exactly one of the two movies
trans.dist = dist(movie.trans, method = "manhattan")
trans.hc.out = hclust(trans.dist)
trans.hc.clusters = cutree(trans.hc.out, 4)
table(trans.hc.clusters)
## trans.hc.clusters
##  1  2  3  4 
##  4 10 20  6

trans.hc.out$labels[trans.hc.clusters == 1]
## [1] "Aladdin (1992)"            "Apollo 13 (1995)"         
## [3] "Dances with Wolves (1990)" "Lion King, The (1994)"

height4=sort(trans.hc.out$height,decreasing = TRUE)[4]
plot(trans.hc.out,
     main = "Manhattan Distance With 0/1 Rating", xlab = "", sub = "", ylab = "", cex = 0.8)
abline(h=height4,col="red")
text(40,height4+2, paste0("h=",round(height4,2)), col = "red")
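
To see how the 0/1 transformation changes the grouping, the two sets of cluster labels computed above (on the same 40 movies) can be cross-tabulated; a minimal sketch:

# Rows: clusters from the raw ratings; columns: clusters from the 0/1 data
table(hc.clusters, trans.hc.clusters)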

1.4 Visualize the data [optional]

# BiocManager::install("ComplexHeatmap")
# BiocManager::install("shape")
library(ComplexHeatmap)
movielens_matrix <- as.matrix(movielens) 
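
The chunk above only coerces the ratings into a matrix. A minimal sketch of drawing the heatmap itself, assuming ComplexHeatmap is installed and leaving the rows and columns unclustered:

# Movies-by-users rating matrix; grey cells are missing ratings
Heatmap(movielens_matrix, name = "rating", na_col = "grey90",
        cluster_rows = FALSE, cluster_columns = FALSE,
        show_column_names = FALSE)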

1.5 Comparing trees [optional]

library(dendextend)
d <- dist(movielens)
# Create two dendrograms
h_avg <- hclust(d, method = "average")
h_single <- hclust(d, method = "single")
dend1 <- as.dendrogram(h_avg)
dend2 <- as.dendrogram(h_single)

# Create a list to hold dendrograms
dend_list <- dendlist(dend1, dend2)

# Compare the two trees
tanglegram(dend_list)
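
tanglegram() only draws the two trees side by side. A minimal sketch of quantifying how similar they are, assuming dendextend's entanglement() and untangle() with default settings:

# Entanglement lies between 0 and 1; lower values mean the two dendrograms agree more closely
entanglement(dend_list)
dend_list %>% untangle(method = "step1side") %>% tanglegram()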

1.6 k-means

1. Basic k-means usage

# Replace missing ratings with 0 so every movie has a complete numeric profile for kmeans()
movie.clean <- movielens 
movie.clean[is.na(movie.clean)] <- 0 
knitr::kable(movie.clean[1:5, 1:5])
                             user_1   user_6   user_7   user_15   user_17
Aladdin (1992)                    0        5      3.0         3       0.0
American Beauty (1999)            5        0      4.0         4       4.0
Apollo 13 (1995)                  0        4      4.5         0       3.5
Back to the Future (1985)         5        0      5.0         5       4.5
Batman (1989)                     4        3      3.0         0       4.5

set.seed(5003)   # k-means starts from random centers; fix the seed for reproducibility
movie.km <- kmeans(movie.clean, centers = 4)
knitr::kable(table(movie.km$cluster), col.names = c("Cluster", "Number Of Movies"))
Cluster   Number Of Movies
      1                 10
      2                  6
      3                  9
      4                 15
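
As with hierarchical clustering, the titles belonging to any one k-means cluster can be read off the row names; a minimal sketch:

# Movies assigned to k-means cluster 1
rownames(movie.clean)[movie.km$cluster == 1]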

2. Visualize results

Method 1

library(ggfortify)
## Loading required package: ggplot2
movie.pr <- prcomp(movie.clean, scale = TRUE)
#plot(movie.pr,type="line")

# autoplot() on a kmeans object projects the observations onto the first two principal components
autoplot(movie.km, data = movie.clean) + ggtitle("Top40 Movies K-means Visualization")

Method 2 (teacher's solution)

movie_pc = prcomp(movie.clean, scale = TRUE)
library(gridExtra)
movie.df = data.frame(PC1 = movie_pc$x[,1], PC2 = movie_pc$x[,2], labels = factor(movie.km$cluster))
ggplot(movie.df, aes(PC1, PC2, col = labels)) + geom_point() + theme_minimal()

with(movie.df, plot(PC1, PC2, col = labels))

1.7 Cluster statistics

Method 1

set.seed(5003)
nk = 2:6
# Run k-means once per k and take WSS and BSS from the same fit,
# so that WSS + BSS equals the (constant) total sum of squares
km.list <- lapply(nk, function(k) kmeans(movie.clean, centers = k))
WSS <- sapply(km.list, function(km) km$tot.withinss)
BSS <- sapply(km.list, function(km) km$betweenss)
TSS <- BSS + WSS
ratio <- WSS / TSS

par(mfrow = c(3, 1))
plot(nk, WSS, xlab = "number of clusters", ylab = "WSS", main = "Within group SS")
plot(nk, BSS, xlab = "number of clusters", ylab = "BSS", main = "Between group SS")
plot(nk, ratio, xlab = "number of clusters", ylab = "ratio = WSS/TSS", main = "Ratio WSS/TSS")

Judging by the ratio WSS/TSS, the best choice in the range k = 2 to 6 is k = 6.

Method 2 (teacher's solution)

set.seed(5003)
center.seq <- 2:6
kmeans <- lapply(center.seq, function(x) kmeans(movie.clean, centers = x))
tot.within.ss <- sapply(kmeans, "[[", "tot.withinss")
between.ss <- sapply(kmeans, "[[", "betweenss")

par(mfrow = c(3, 1))
plot(center.seq, tot.within.ss, xlab = "Number of clusters", main = "Within group SS")
plot(center.seq, between.ss, xlab = "Number of clusters", main = "Between group SS")

plot(center.seq, tot.within.ss/(between.ss+tot.within.ss), xlab = "Number of clusters", main = "Ratio WSS/TSS")

Again, the ratio WSS/TSS points to k = 6 as the best choice in this range.

2 Author by word count

2.1 Data input

author.dat <- read.csv("author_count.csv", header = TRUE)
numeric.dat <- author.dat[-1]          # word-count features only
authors <- factor(author.dat[[1]])     # author labels from the first column
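
A quick look at the dimensions and the number of documents per author (a minimal sketch):

dim(numeric.dat)   # documents by word-count features
table(authors)     # documents per author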

2.2 PCA

author.pr <- prcomp(numeric.dat, scale = TRUE)
#biplot(author.pr, cex = 0.5)
#plot(author.pr,type="line")
PCA_plot<-autoplot(author.pr, data = author.dat, colour = "author")+ggtitle("PCA")
PCA_plot
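
The PCA plot only shows the first two components; a minimal sketch of checking how much variance they actually explain, using the fit above:

# Standard deviation, proportion of variance and cumulative proportion for PC1-PC2
summary(author.pr)$importance[, 1:2]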

2.3 t-SNE

library(Rtsne)
set.seed(5003)   # t-SNE is stochastic; fix the seed so the panels are reproducible
tsne_plot_list <- vector("list", 8) 
for (p in seq(2, 16, by = 2)) {
  author.Rtsne <- Rtsne(numeric.dat, perplexity = p)
  author.rtsne.df <- data.frame(dim1 = author.Rtsne$Y[, 1],
                                dim2 = author.Rtsne$Y[, 2],
                                labels = authors)
  tsne_plot_list[[p/2]] <- ggplot(author.rtsne.df, aes(dim1, dim2, col = labels)) +
    geom_point() +
    ggtitle(paste0("t-SNE (Perplexity = ", p, ")"))
}
glist <- lapply(tsne_plot_list, ggplotGrob)
marrangeGrob(glist, nrow = 2, ncol = 2)

2.4 MDS

MDS_list <- vector("list", 6) 
distance <- list("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski")

for (m in seq(6)) {
  author.dist <- dist(numeric.dat, method = distance[[m]])
  mds <- cmdscale(author.dist, k = 2)
  colnames(mds) <- c("x", "y")
  mds <- data.frame(mds, author = authors)
  MDS_list[[m]] <- ggplot(mds, aes(x = x, y = y, label = author, col = author)) +
    geom_point() +
    theme_minimal() +
    ggtitle(paste0("MDS (Distance = ", distance[[m]], ")"))
}
MDS_glist <- lapply(MDS_list, ggplotGrob)
marrangeGrob(MDS_glist, nrow = 2, ncol = 2)

2.5 Compare and contrast

p1<-PCA_plot
p2<-tsne_plot_list[[4]]
p3<-MDS_list[[4]]
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
multiplot(p1, p2, p3, cols=2)

Compared with the other two dimensionality reduction methods, t-SNE separates the author groups much more clearly.

3 Shiny app to allow the user to explore and decide

Click to view the Shiny app:

High dimensional visualization and analytics

APP. Student Info

Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 4
Student Name: Yujun Yao(June Yao)
SID: 500316995
Email: