library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
setwd("C:/Users/gabeg/Documents/Uni/Stat 5003/Week 4")
#1.2
movie_len <- read.csv('movielens_top40.csv')
dim(movie_len)
## [1]  40 153
#1.3
d <- dist(movie_len)
h <-hclust(d)
plot(h, cex = 0.2)

h_avg <- hclust(d, method = 'average')

plot(h_avg, cex = 0.3)

h_comp <- hclust(d, method = 'complete')

plot(h_comp, cex = 0.3)

h_sing <- hclust(d, method = 'single')

plot(h_sing, cex = 0.3)

tree_1 <- cutree(h_avg, k = 4)
head(tree_1)
##            Aladdin (1992)    American Beauty (1999)          Apollo 13 (1995) 
##                         1                         2                         1 
## Back to the Future (1985)             Batman (1989)         Braveheart (1995) 
##                         1                         1                         2
split(names(tree_1), tree_1)
## $`1`
##  [1] "Aladdin (1992)"                   "Apollo 13 (1995)"                
##  [3] "Back to the Future (1985)"        "Batman (1989)"                   
##  [5] "Dances with Wolves (1990)"        "Fugitive, The (1993)"            
##  [7] "Jurassic Park (1993)"             "Lion King, The (1994)"           
##  [9] "Men in Black (a.k.a. MIB) (1997)" "Shrek (2001)"                    
## [11] "Toy Story (1995)"                
## 
## $`2`
##  [1] "American Beauty (1999)"                                                        
##  [2] "Braveheart (1995)"                                                             
##  [3] "Fargo (1996)"                                                                  
##  [4] "Fight Club (1999)"                                                             
##  [5] "Forrest Gump (1994)"                                                           
##  [6] "Godfather, The (1972)"                                                         
##  [7] "Lord of the Rings: The Fellowship of the Ring, The (2001)"                     
##  [8] "Lord of the Rings: The Return of the King, The (2003)"                         
##  [9] "Lord of the Rings: The Two Towers, The (2002)"                                 
## [10] "Matrix, The (1999)"                                                            
## [11] "Pulp Fiction (1994)"                                                           
## [12] "Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)"
## [13] "Saving Private Ryan (1998)"                                                    
## [14] "Schindler's List (1993)"                                                       
## [15] "Seven (a.k.a. Se7en) (1995)"                                                   
## [16] "Shawshank Redemption, The (1994)"                                              
## [17] "Silence of the Lambs, The (1991)"                                              
## [18] "Star Wars: Episode IV - A New Hope (1977)"                                     
## [19] "Star Wars: Episode V - The Empire Strikes Back (1980)"                         
## [20] "Star Wars: Episode VI - Return of the Jedi (1983)"                             
## [21] "Terminator 2: Judgment Day (1991)"                                             
## [22] "Twelve Monkeys (a.k.a. 12 Monkeys) (1995)"                                     
## [23] "Usual Suspects, The (1995)"                                                    
## 
## $`3`
## [1] "Gladiator (2000)"        "Sixth Sense, The (1999)"
## 
## $`4`
## [1] "Independence Day (a.k.a. ID4) (1996)"
## [2] "Mission: Impossible (1996)"          
## [3] "Speed (1994)"                        
## [4] "True Lies (1994)"
table(tree_1)
## tree_1
##  1  2  3  4 
## 11 23  2  4
cutree(h_avg, h = 16)
##                                                                 Aladdin (1992) 
##                                                                              1 
##                                                         American Beauty (1999) 
##                                                                              1 
##                                                               Apollo 13 (1995) 
##                                                                              1 
##                                                      Back to the Future (1985) 
##                                                                              1 
##                                                                  Batman (1989) 
##                                                                              1 
##                                                              Braveheart (1995) 
##                                                                              1 
##                                                      Dances with Wolves (1990) 
##                                                                              1 
##                                                                   Fargo (1996) 
##                                                                              1 
##                                                              Fight Club (1999) 
##                                                                              1 
##                                                            Forrest Gump (1994) 
##                                                                              1 
##                                                           Fugitive, The (1993) 
##                                                                              1 
##                                                               Gladiator (2000) 
##                                                                              1 
##                                                          Godfather, The (1972) 
##                                                                              1 
##                                           Independence Day (a.k.a. ID4) (1996) 
##                                                                              1 
##                                                           Jurassic Park (1993) 
##                                                                              1 
##                                                          Lion King, The (1994) 
##                                                                              1 
##                      Lord of the Rings: The Fellowship of the Ring, The (2001) 
##                                                                              1 
##                          Lord of the Rings: The Return of the King, The (2003) 
##                                                                              1 
##                                  Lord of the Rings: The Two Towers, The (2002) 
##                                                                              1 
##                                                             Matrix, The (1999) 
##                                                                              1 
##                                               Men in Black (a.k.a. MIB) (1997) 
##                                                                              1 
##                                                     Mission: Impossible (1996) 
##                                                                              1 
##                                                            Pulp Fiction (1994) 
##                                                                              1 
## Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) 
##                                                                              1 
##                                                     Saving Private Ryan (1998) 
##                                                                              1 
##                                                        Schindler's List (1993) 
##                                                                              1 
##                                                    Seven (a.k.a. Se7en) (1995) 
##                                                                              1 
##                                               Shawshank Redemption, The (1994) 
##                                                                              1 
##                                                                   Shrek (2001) 
##                                                                              1 
##                                               Silence of the Lambs, The (1991) 
##                                                                              1 
##                                                        Sixth Sense, The (1999) 
##                                                                              1 
##                                                                   Speed (1994) 
##                                                                              1 
##                                      Star Wars: Episode IV - A New Hope (1977) 
##                                                                              1 
##                          Star Wars: Episode V - The Empire Strikes Back (1980) 
##                                                                              1 
##                              Star Wars: Episode VI - Return of the Jedi (1983) 
##                                                                              1 
##                                              Terminator 2: Judgment Day (1991) 
##                                                                              1 
##                                                               Toy Story (1995) 
##                                                                              1 
##                                                               True Lies (1994) 
##                                                                              1 
##                                      Twelve Monkeys (a.k.a. 12 Monkeys) (1995) 
##                                                                              1 
##                                                     Usual Suspects, The (1995) 
##                                                                              1
# 1.6 k-means
movie_mat <- as.matrix(movie_len)
movie_mat[is.na(movie_mat)] <- 0
kmeans_res <- kmeans(movie_mat, centers = 4)
table(kmeans_res$cluster)
## 
##  1  2  3  4 
## 12 17  5  6
#1.7 WSS
author <- read.csv('author_count.csv', header = TRUE)
dim(author)
## [1] 841  70
colSums(is.na(author))
## author      a    all   also     an    and    any    are     as     at     be 
##      0      0      0      0      0      0      0      0      0      0      0 
##   been    but     by    can     do   down   even  every   for.   from    had 
##      0      0      0      0      0      0      0      0      0      0      0 
##    has   have    her    his    if.    in.   into     is     it    its    may 
##      0      0      0      0      0      0      0      0      0      0      0 
##   more   must     my     no    not    now     of     on    one   only     or 
##      0      0      0      0      0      0      0      0      0      0      0 
##    our should     so   some   such   than   that    the  their   then  there 
##      0      0      0      0      0      0      0      0      0      0      0 
## things   this     to     up   upon    was   were   what   when  which    who 
##      0      0      0      0      0      0      0      0      0      0      0 
##   will   with  would   your 
##      0      0      0      0
#turn Authors into a factor
authors_only <- select(author, author)

author_remove <- select(author, - author)

std_author <- scale(author_remove)

pca_result <- prcomp(std_author, scale = TRUE)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
pca_author_df <- data.frame(PC1 = pca_result$x[,1], PC2 = pca_result$x[,2], PC3 = pca_result$x[,3], labels = authors_only)
theme_set(
  theme_bw()
)
pca_plot <- ggplot(pca_author_df, aes(x = PC1, y = PC2, color = as.factor(author))) +
            geom_point() +
   labs(title = "PCA: Authors", color = "Authors")+

            theme_minimal()
pca_plot

#2.4
dist_metrics <- c("euclidean", "manhattan", "maximum", "binary", "canberra", "minkowski")

gof_mds <- list()

for (dist in dist_metrics) {
  distances <- dist(author_remove, method = dist)
  
  mds_res <- cmdscale(distances, eig = TRUE)
  
  gof_df <- data.frame(gof = mds_res$GOF)
  gof_df$dist<- dist
  gof_mds[[dist]] <- gof_df
  
  mds_df <- data.frame(X = mds_res[["points"]][,1], Y = mds_res[[ "points"]][,2],author = authors_only)
  
  mds_plt <- ggplot(mds_df, aes(X, Y, color = author)) +
    geom_point() +
    labs(title = paste("MDS Visualization:", dist, " distance"))
  
  print(mds_plt)
}

Question 2.5

PCA

  • Lets try and create a Scree plot for PCA to see how much of the Explained Variance comes from each successive Principal Component
  • According to Sanchita Manghale (https://sanchitamangale12.medium.com/scree-plot-733ed72c8608) we can use the Scree Plot criterion to decide the appropriate number of Principal Components. This is the point where the ‘elbow’ of the Scree plot shallows out.
explained_var_pca <- pca_result$sdev^2 / sum(pca_result$sdev^2)

expl_pca_df <- data.frame(PCA_Component = 1:length(explained_var_pca), Explained_Variance = explained_var_pca)

## looks like Scree Criterion occurs on 9th value 

scree_crit <- explained_var_pca[9]

ggplot(expl_pca_df, aes(x = PCA_Component, y = Explained_Variance)) +
  geom_line() +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 20))+
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  geom_hline(yintercept= scree_crit, linetype = "longdash", color = 'red') +
  labs(x = "Principal Component", y = "Variance Explained", title = "Scree Plot") 

  • 9 Principal Components is very large!
  • and it still only explains ~48% of the variance in the data.
  • Lets make a Cumulative Variance graph to visualise this better
expl_pca_df <- mutate(expl_pca_df, Cumulative_Variance = cumsum(expl_pca_df$Explained_Variance))

#cumulative scree crit
cum_scree <- expl_pca_df[9,3]

ggplot(expl_pca_df, aes(x = PCA_Component, y = Cumulative_Variance)) +
  geom_line() +
    scale_x_continuous(breaks = scales::pretty_breaks(n = 20))+
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
    geom_hline(yintercept= cum_scree, linetype = "longdash", color = 'red') +
  labs(x = "Principal Component", y = "Cumulative Variance Explained", title = "Cumulative Variance vs. Principal Components") 

t-SNE

  • Perplexity - Hyperparameter that balances global/ local attention. In a sense it is a guess about the number of close neighbours each point has. - From Google Cloud Article

  • My understanding:

    • increasing perplexity -> increased weight to global factors, ensuring perplexity > num attrributes

    • By definition, cluster Sizes are not directly comparable across perplexities. They get smaller as num neighbours/ perplexity increases

    • Distances between clusers is highly dependent on number of attributes/ perplexity making it hard to interpret.

  • References: https://distill.pub/2016/misread-tsne/,https://learningwithdata.com/posts/tylerfolkman/why-you-are-using-t-sne-wrong-502412aab0c0/

#2.3
library(Rtsne)
library(ggpubr)
set.seed(530306627)

perplexity<- c(1, 5, 10, 20)

rtse <- lapply(perplexity, function(x){
  y <- Rtsne(std_author, dims = 2, perplexity = x)$Y
  attr(y, "perplexity") <- x
  y
}
  )

tsne_plots <- lapply(rtse, function(dat) {
  perplexity <- attr(dat, "perplexity")
  dat <- as.data.frame(dat)
  names(dat) <- c("x", "y")
  dat[["author"]] <- author
  ggplot(dat) + geom_point(aes(x =x, y= y, colour = as.factor(author$author))) +
    labs(title = "t-SNE: Authors", colour = "Authors")+
    ggtitle(paste0("Perplexity = ", perplexity))
  

})

ggarrange(plotlist = tsne_plots, common.legend = TRUE)

perplexity<- c(30, 50, 69, 100)

rtse <- lapply(perplexity, function(x){
  y <- Rtsne(std_author, dims = 2, perplexity = x)$Y
  attr(y, "perplexity") <- x
  y
}
  )

tsne_plots <- lapply(rtse, function(dat) {
  perplexity <- attr(dat, "perplexity")
  dat <- as.data.frame(dat)
  names(dat) <- c("x", "y")
  dat[["author"]] <- author
  ggplot(dat) + geom_point(aes(x =x, y= y, colour = as.factor(author$author))) +
    labs(title = "t-SNE: Authors", colour = "Authors")+
    ggtitle(paste0("Perplexity = ", perplexity))
  

})

ggarrange(plotlist = tsne_plots, common.legend = TRUE)

Discussion

  • From a perplexity of 5 we start to see distinct groups forming.

  • As we would expect increasing perplexity -> smaller clusters

  • The most interesting data points are the books that share a vocabulary with another author:

    • With a perplexity of 5 Shakespeare appears to have 1 book that has a similar vocabulary to Jane Austen’s bibliography. This disappears when we increase perplexity to 10

    • Jack London has 4 books that share a vocabulary with Jane Austen’s bibliography. As we increase the perplexity we notice these books being ‘pulled’ into Jack London’s cluster. However even with a vocabulary of 69 (good practice maximum as it’s == Sum(attributes)) there is still 1 book that remains in the Jane Austen cluster.

  • Relative distance between clusters appears to be fairly uniform across perplexities

  • In order to balance global/ local attention and maintain the interesting overlap data points I would say a perplexity of 15 visually looks the best.

MDS

#extracting the eigenvalue list into a dataframe

final_gof_df <- do.call(rbind, gof_mds)
print(final_gof_df)
##                   gof      dist
## euclidean.1 0.5239598 euclidean
## euclidean.2 0.5239598 euclidean
## manhattan.1 0.2306940 manhattan
## manhattan.2 0.2986354 manhattan
## maximum.1   0.3250229   maximum
## maximum.2   0.4624583   maximum
## binary.1    0.1469623    binary
## binary.2    0.2069811    binary
## canberra.1  0.1306663  canberra
## canberra.2  0.1674362  canberra
## minkowski.1 0.5239598 minkowski
## minkowski.2 0.5239598 minkowski

Discussion

  • It appears that Euclidean and Minkowski have the best GOF. In fact we see that they are actually the exact same! This is because Minkowski distance by default in the cmdscale function has p = 2 which is the same as Euclidean.

  • I will investigate other p-values for Minkowski and compare this against Euclidean’s GOF.

p_vals <- seq(2, 6, by = 1)

p_gof_mds <- list()

for (p in p_vals) {

  distances <- dist(author_remove, method = dist, p =p)
  
  mds_res <- cmdscale(distances, eig = TRUE)
  
  p_gof_df <- data.frame( gof =mds_res$GOF)
  p_gof_df$p<- p
  p_gof_mds[[p]] <- p_gof_df
  
  mds_df <- data.frame(X = mds_res[["points"]][,1], Y = mds_res[[ "points"]][,2],author = authors_only)
  
  mds_plt <- ggplot(mds_df, aes(X, Y, color = author)) +
    geom_point() +
    labs(title = paste("MDS Visualization:", p, " distance"))

  print(mds_plt)
  
}

Observations

  • Increasing p seems to make the clusters tighter together. Not clear whether there is any increased explanatory power. Lets look at the Goodness of Fit Measure
#extracting the eigenvalue list into a dataframe

final_p_df <- do.call(rbind, p_gof_mds)

print(final_p_df)
##          gof p
## 1  0.5239598 2
## 2  0.5239598 2
## 3  0.4714406 3
## 4  0.5393677 3
## 5  0.4437618 4
## 6  0.5416831 4
## 7  0.4213057 5
## 8  0.5346526 5
## 9  0.4039334 6
## 10 0.5257228 6
max_only <- group_by(final_p_df, p) %>% mutate(max_gof = max(gof))

plot(x = max_only$p, y = max_only$max_gof, type = 'b',
     ylab = "Goodness of Fit",
     xlab = "Power of Minkowski Distance",
     main = "GOF vs. P")

Discussion

  • It appears that increasing p of the Minkowski has a somewhat parabolic relationship with goodness of fit.

  • The maximum GOF occurs when p == 4

Compare and Contrast our Best Measures: