library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
setwd("C:/Users/gabeg/Documents/Uni/Stat 5003/Week 4")
#1.2
movie_len <- read.csv('movielens_top40.csv')
dim(movie_len)
## [1] 40 153
#1.3
d <- dist(movie_len)
h <-hclust(d)
plot(h, cex = 0.2)
h_avg <- hclust(d, method = 'average')
plot(h_avg, cex = 0.3)
h_comp <- hclust(d, method = 'complete')
plot(h_comp, cex = 0.3)
h_sing <- hclust(d, method = 'single')
plot(h_sing, cex = 0.3)
tree_1 <- cutree(h_avg, k = 4)
head(tree_1)
## Aladdin (1992) American Beauty (1999) Apollo 13 (1995)
## 1 2 1
## Back to the Future (1985) Batman (1989) Braveheart (1995)
## 1 1 2
split(names(tree_1), tree_1)
## $`1`
## [1] "Aladdin (1992)" "Apollo 13 (1995)"
## [3] "Back to the Future (1985)" "Batman (1989)"
## [5] "Dances with Wolves (1990)" "Fugitive, The (1993)"
## [7] "Jurassic Park (1993)" "Lion King, The (1994)"
## [9] "Men in Black (a.k.a. MIB) (1997)" "Shrek (2001)"
## [11] "Toy Story (1995)"
##
## $`2`
## [1] "American Beauty (1999)"
## [2] "Braveheart (1995)"
## [3] "Fargo (1996)"
## [4] "Fight Club (1999)"
## [5] "Forrest Gump (1994)"
## [6] "Godfather, The (1972)"
## [7] "Lord of the Rings: The Fellowship of the Ring, The (2001)"
## [8] "Lord of the Rings: The Return of the King, The (2003)"
## [9] "Lord of the Rings: The Two Towers, The (2002)"
## [10] "Matrix, The (1999)"
## [11] "Pulp Fiction (1994)"
## [12] "Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)"
## [13] "Saving Private Ryan (1998)"
## [14] "Schindler's List (1993)"
## [15] "Seven (a.k.a. Se7en) (1995)"
## [16] "Shawshank Redemption, The (1994)"
## [17] "Silence of the Lambs, The (1991)"
## [18] "Star Wars: Episode IV - A New Hope (1977)"
## [19] "Star Wars: Episode V - The Empire Strikes Back (1980)"
## [20] "Star Wars: Episode VI - Return of the Jedi (1983)"
## [21] "Terminator 2: Judgment Day (1991)"
## [22] "Twelve Monkeys (a.k.a. 12 Monkeys) (1995)"
## [23] "Usual Suspects, The (1995)"
##
## $`3`
## [1] "Gladiator (2000)" "Sixth Sense, The (1999)"
##
## $`4`
## [1] "Independence Day (a.k.a. ID4) (1996)"
## [2] "Mission: Impossible (1996)"
## [3] "Speed (1994)"
## [4] "True Lies (1994)"
table(tree_1)
## tree_1
## 1 2 3 4
## 11 23 2 4
cutree(h_avg, h = 16)
## Aladdin (1992)
## 1
## American Beauty (1999)
## 1
## Apollo 13 (1995)
## 1
## Back to the Future (1985)
## 1
## Batman (1989)
## 1
## Braveheart (1995)
## 1
## Dances with Wolves (1990)
## 1
## Fargo (1996)
## 1
## Fight Club (1999)
## 1
## Forrest Gump (1994)
## 1
## Fugitive, The (1993)
## 1
## Gladiator (2000)
## 1
## Godfather, The (1972)
## 1
## Independence Day (a.k.a. ID4) (1996)
## 1
## Jurassic Park (1993)
## 1
## Lion King, The (1994)
## 1
## Lord of the Rings: The Fellowship of the Ring, The (2001)
## 1
## Lord of the Rings: The Return of the King, The (2003)
## 1
## Lord of the Rings: The Two Towers, The (2002)
## 1
## Matrix, The (1999)
## 1
## Men in Black (a.k.a. MIB) (1997)
## 1
## Mission: Impossible (1996)
## 1
## Pulp Fiction (1994)
## 1
## Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
## 1
## Saving Private Ryan (1998)
## 1
## Schindler's List (1993)
## 1
## Seven (a.k.a. Se7en) (1995)
## 1
## Shawshank Redemption, The (1994)
## 1
## Shrek (2001)
## 1
## Silence of the Lambs, The (1991)
## 1
## Sixth Sense, The (1999)
## 1
## Speed (1994)
## 1
## Star Wars: Episode IV - A New Hope (1977)
## 1
## Star Wars: Episode V - The Empire Strikes Back (1980)
## 1
## Star Wars: Episode VI - Return of the Jedi (1983)
## 1
## Terminator 2: Judgment Day (1991)
## 1
## Toy Story (1995)
## 1
## True Lies (1994)
## 1
## Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
## 1
## Usual Suspects, The (1995)
## 1
# 1.6 k-means
movie_mat <- as.matrix(movie_len)
movie_mat[is.na(movie_mat)] <- 0
kmeans_res <- kmeans(movie_mat, centers = 4)
table(kmeans_res$cluster)
##
## 1 2 3 4
## 12 17 5 6
#1.7 WSS
author <- read.csv('author_count.csv', header = TRUE)
dim(author)
## [1] 841 70
colSums(is.na(author))
## author a all also an and any are as at be
## 0 0 0 0 0 0 0 0 0 0 0
## been but by can do down even every for. from had
## 0 0 0 0 0 0 0 0 0 0 0
## has have her his if. in. into is it its may
## 0 0 0 0 0 0 0 0 0 0 0
## more must my no not now of on one only or
## 0 0 0 0 0 0 0 0 0 0 0
## our should so some such than that the their then there
## 0 0 0 0 0 0 0 0 0 0 0
## things this to up upon was were what when which who
## 0 0 0 0 0 0 0 0 0 0 0
## will with would your
## 0 0 0 0
#turn Authors into a factor
authors_only <- select(author, author)
author_remove <- select(author, - author)
std_author <- scale(author_remove)
pca_result <- prcomp(std_author, scale = TRUE)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
pca_author_df <- data.frame(PC1 = pca_result$x[,1], PC2 = pca_result$x[,2], PC3 = pca_result$x[,3], labels = authors_only)
theme_set(
theme_bw()
)
pca_plot <- ggplot(pca_author_df, aes(x = PC1, y = PC2, color = as.factor(author))) +
geom_point() +
labs(title = "PCA: Authors", color = "Authors")+
theme_minimal()
pca_plot
#2.4
dist_metrics <- c("euclidean", "manhattan", "maximum", "binary", "canberra", "minkowski")
gof_mds <- list()
for (dist in dist_metrics) {
distances <- dist(author_remove, method = dist)
mds_res <- cmdscale(distances, eig = TRUE)
gof_df <- data.frame(gof = mds_res$GOF)
gof_df$dist<- dist
gof_mds[[dist]] <- gof_df
mds_df <- data.frame(X = mds_res[["points"]][,1], Y = mds_res[[ "points"]][,2],author = authors_only)
mds_plt <- ggplot(mds_df, aes(X, Y, color = author)) +
geom_point() +
labs(title = paste("MDS Visualization:", dist, " distance"))
print(mds_plt)
}
explained_var_pca <- pca_result$sdev^2 / sum(pca_result$sdev^2)
expl_pca_df <- data.frame(PCA_Component = 1:length(explained_var_pca), Explained_Variance = explained_var_pca)
## looks like Scree Criterion occurs on 9th value
scree_crit <- explained_var_pca[9]
ggplot(expl_pca_df, aes(x = PCA_Component, y = Explained_Variance)) +
geom_line() +
scale_x_continuous(breaks = scales::pretty_breaks(n = 20))+
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
geom_hline(yintercept= scree_crit, linetype = "longdash", color = 'red') +
labs(x = "Principal Component", y = "Variance Explained", title = "Scree Plot")
expl_pca_df <- mutate(expl_pca_df, Cumulative_Variance = cumsum(expl_pca_df$Explained_Variance))
#cumulative scree crit
cum_scree <- expl_pca_df[9,3]
ggplot(expl_pca_df, aes(x = PCA_Component, y = Cumulative_Variance)) +
geom_line() +
scale_x_continuous(breaks = scales::pretty_breaks(n = 20))+
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
geom_hline(yintercept= cum_scree, linetype = "longdash", color = 'red') +
labs(x = "Principal Component", y = "Cumulative Variance Explained", title = "Cumulative Variance vs. Principal Components")
Perplexity - Hyperparameter that balances global/ local attention. In a sense it is a guess about the number of close neighbours each point has. - From Google Cloud Article
My understanding:
increasing perplexity -> increased weight to global factors, ensuring perplexity > num attrributes
By definition, cluster Sizes are not directly comparable across perplexities. They get smaller as num neighbours/ perplexity increases
Distances between clusers is highly dependent on number of attributes/ perplexity making it hard to interpret.
#2.3
library(Rtsne)
library(ggpubr)
set.seed(530306627)
perplexity<- c(1, 5, 10, 20)
rtse <- lapply(perplexity, function(x){
y <- Rtsne(std_author, dims = 2, perplexity = x)$Y
attr(y, "perplexity") <- x
y
}
)
tsne_plots <- lapply(rtse, function(dat) {
perplexity <- attr(dat, "perplexity")
dat <- as.data.frame(dat)
names(dat) <- c("x", "y")
dat[["author"]] <- author
ggplot(dat) + geom_point(aes(x =x, y= y, colour = as.factor(author$author))) +
labs(title = "t-SNE: Authors", colour = "Authors")+
ggtitle(paste0("Perplexity = ", perplexity))
})
ggarrange(plotlist = tsne_plots, common.legend = TRUE)
perplexity<- c(30, 50, 69, 100)
rtse <- lapply(perplexity, function(x){
y <- Rtsne(std_author, dims = 2, perplexity = x)$Y
attr(y, "perplexity") <- x
y
}
)
tsne_plots <- lapply(rtse, function(dat) {
perplexity <- attr(dat, "perplexity")
dat <- as.data.frame(dat)
names(dat) <- c("x", "y")
dat[["author"]] <- author
ggplot(dat) + geom_point(aes(x =x, y= y, colour = as.factor(author$author))) +
labs(title = "t-SNE: Authors", colour = "Authors")+
ggtitle(paste0("Perplexity = ", perplexity))
})
ggarrange(plotlist = tsne_plots, common.legend = TRUE)
From a perplexity of 5 we start to see distinct groups forming.
As we would expect increasing perplexity -> smaller clusters
The most interesting data points are the books that share a vocabulary with another author:
With a perplexity of 5 Shakespeare appears to have 1 book that has a similar vocabulary to Jane Austen’s bibliography. This disappears when we increase perplexity to 10
Jack London has 4 books that share a vocabulary with Jane Austen’s bibliography. As we increase the perplexity we notice these books being ‘pulled’ into Jack London’s cluster. However even with a vocabulary of 69 (good practice maximum as it’s == Sum(attributes)) there is still 1 book that remains in the Jane Austen cluster.
Relative distance between clusters appears to be fairly uniform across perplexities
In order to balance global/ local attention and maintain the interesting overlap data points I would say a perplexity of 15 visually looks the best.
MDS is a measure of relative similarity/ dissimilarity.
In Q2.4 we computed MDS using a number of different distance measures. I will use the Goodness of Fit measure to compare which distance measure does the best job of explaining the variance of similarity of the data set.
References: https://pages.mtu.edu/~shanem/psy5220/daily/Day16/MDS.html
#extracting the eigenvalue list into a dataframe
final_gof_df <- do.call(rbind, gof_mds)
print(final_gof_df)
## gof dist
## euclidean.1 0.5239598 euclidean
## euclidean.2 0.5239598 euclidean
## manhattan.1 0.2306940 manhattan
## manhattan.2 0.2986354 manhattan
## maximum.1 0.3250229 maximum
## maximum.2 0.4624583 maximum
## binary.1 0.1469623 binary
## binary.2 0.2069811 binary
## canberra.1 0.1306663 canberra
## canberra.2 0.1674362 canberra
## minkowski.1 0.5239598 minkowski
## minkowski.2 0.5239598 minkowski
It appears that Euclidean and Minkowski have the best GOF. In fact we see that they are actually the exact same! This is because Minkowski distance by default in the cmdscale function has p = 2 which is the same as Euclidean.
I will investigate other p-values for Minkowski and compare this against Euclidean’s GOF.
p_vals <- seq(2, 6, by = 1)
p_gof_mds <- list()
for (p in p_vals) {
distances <- dist(author_remove, method = dist, p =p)
mds_res <- cmdscale(distances, eig = TRUE)
p_gof_df <- data.frame( gof =mds_res$GOF)
p_gof_df$p<- p
p_gof_mds[[p]] <- p_gof_df
mds_df <- data.frame(X = mds_res[["points"]][,1], Y = mds_res[[ "points"]][,2],author = authors_only)
mds_plt <- ggplot(mds_df, aes(X, Y, color = author)) +
geom_point() +
labs(title = paste("MDS Visualization:", p, " distance"))
print(mds_plt)
}
#extracting the eigenvalue list into a dataframe
final_p_df <- do.call(rbind, p_gof_mds)
print(final_p_df)
## gof p
## 1 0.5239598 2
## 2 0.5239598 2
## 3 0.4714406 3
## 4 0.5393677 3
## 5 0.4437618 4
## 6 0.5416831 4
## 7 0.4213057 5
## 8 0.5346526 5
## 9 0.4039334 6
## 10 0.5257228 6
max_only <- group_by(final_p_df, p) %>% mutate(max_gof = max(gof))
plot(x = max_only$p, y = max_only$max_gof, type = 'b',
ylab = "Goodness of Fit",
xlab = "Power of Minkowski Distance",
main = "GOF vs. P")
It appears that increasing p of the Minkowski has a somewhat parabolic relationship with goodness of fit.
The maximum GOF occurs when p == 4
The best scores for each different method are:
PCA: 9 components account for ~48% of variance being explained
t-SNE: A perplexity = 15 strikes the best balance between global/local attention
MDS: Using Minkowski distance with p = 4 gives the highest GOF score.
From a high level, my understanding of the difference between these measures is that PCA maintains the variance between data points so is directly interpretable. t-SNE just maintains the magnitude in difference between points and MDS just maintains the relative distance.
A PCA with 9 components accounting for ~48% of variance is quite a lot of dimensions. From a visualisation POV this is not very helpful as humans struggle to perceive visualisations in any more than 3 dimensions. However by telling us which 9 dimensions account for ~48% of variance we could use this information to cut down on other dimensions and use one of the other techniques for a more detailed visualisation.
The interesting thing about the t-SNE visualisation is the moving of the book that share a vocabulary with another author. As we increase perplexity the global connection takes over and we get to see these books be ‘drawn’ to their actual author.
The MDS graph gives the best visual representation of how relatively close each authors book’s vocabulary are. Combining this with the information we have gleaned about similar vocabularies from the t-SNE visualisation we can see that the bibligraphies are quite close. With London and Austen having some books that break the general mould.