library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

setwd("C:/Users/gabeg/Documents/Uni/Stat 5003/Week 4")

#1.2
movie_len <- read.csv('movielens_top40.csv')
dim(movie_len)

## [1]  40 153

#1.3
d <- dist(movie_len)
h <-hclust(d)
plot(h, cex = 0.2)

h_avg <- hclust(d, method = 'average')

plot(h_avg, cex = 0.3)

h_comp <- hclust(d, method = 'complete')

plot(h_comp, cex = 0.3)

h_sing <- hclust(d, method = 'single')

plot(h_sing, cex = 0.3)

tree_1 <- cutree(h_avg, k = 4)
head(tree_1)

##            Aladdin (1992)    American Beauty (1999)          Apollo 13 (1995) 
##                         1                         2                         1 
## Back to the Future (1985)             Batman (1989)         Braveheart (1995) 
##                         1                         1                         2

split(names(tree_1), tree_1)

## $`1`
##  [1] "Aladdin (1992)"                   "Apollo 13 (1995)"                
##  [3] "Back to the Future (1985)"        "Batman (1989)"                   
##  [5] "Dances with Wolves (1990)"        "Fugitive, The (1993)"            
##  [7] "Jurassic Park (1993)"             "Lion King, The (1994)"           
##  [9] "Men in Black (a.k.a. MIB) (1997)" "Shrek (2001)"                    
## [11] "Toy Story (1995)"                
## 
## $`2`
##  [1] "American Beauty (1999)"                                                        
##  [2] "Braveheart (1995)"                                                             
##  [3] "Fargo (1996)"                                                                  
##  [4] "Fight Club (1999)"                                                             
##  [5] "Forrest Gump (1994)"                                                           
##  [6] "Godfather, The (1972)"                                                         
##  [7] "Lord of the Rings: The Fellowship of the Ring, The (2001)"                     
##  [8] "Lord of the Rings: The Return of the King, The (2003)"                         
##  [9] "Lord of the Rings: The Two Towers, The (2002)"                                 
## [10] "Matrix, The (1999)"                                                            
## [11] "Pulp Fiction (1994)"                                                           
## [12] "Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)"
## [13] "Saving Private Ryan (1998)"                                                    
## [14] "Schindler's List (1993)"                                                       
## [15] "Seven (a.k.a. Se7en) (1995)"                                                   
## [16] "Shawshank Redemption, The (1994)"                                              
## [17] "Silence of the Lambs, The (1991)"                                              
## [18] "Star Wars: Episode IV - A New Hope (1977)"                                     
## [19] "Star Wars: Episode V - The Empire Strikes Back (1980)"                         
## [20] "Star Wars: Episode VI - Return of the Jedi (1983)"                             
## [21] "Terminator 2: Judgment Day (1991)"                                             
## [22] "Twelve Monkeys (a.k.a. 12 Monkeys) (1995)"                                     
## [23] "Usual Suspects, The (1995)"                                                    
## 
## $`3`
## [1] "Gladiator (2000)"        "Sixth Sense, The (1999)"
## 
## $`4`
## [1] "Independence Day (a.k.a. ID4) (1996)"
## [2] "Mission: Impossible (1996)"          
## [3] "Speed (1994)"                        
## [4] "True Lies (1994)"

table(tree_1)

## tree_1
##  1  2  3  4 
## 11 23  2  4

cutree(h_avg, h = 16)

##                                                                 Aladdin (1992) 
##                                                                              1 
##                                                         American Beauty (1999) 
##                                                                              1 
##                                                               Apollo 13 (1995) 
##                                                                              1 
##                                                      Back to the Future (1985) 
##                                                                              1 
##                                                                  Batman (1989) 
##                                                                              1 
##                                                              Braveheart (1995) 
##                                                                              1 
##                                                      Dances with Wolves (1990) 
##                                                                              1 
##                                                                   Fargo (1996) 
##                                                                              1 
##                                                              Fight Club (1999) 
##                                                                              1 
##                                                            Forrest Gump (1994) 
##                                                                              1 
##                                                           Fugitive, The (1993) 
##                                                                              1 
##                                                               Gladiator (2000) 
##                                                                              1 
##                                                          Godfather, The (1972) 
##                                                                              1 
##                                           Independence Day (a.k.a. ID4) (1996) 
##                                                                              1 
##                                                           Jurassic Park (1993) 
##                                                                              1 
##                                                          Lion King, The (1994) 
##                                                                              1 
##                      Lord of the Rings: The Fellowship of the Ring, The (2001) 
##                                                                              1 
##                          Lord of the Rings: The Return of the King, The (2003) 
##                                                                              1 
##                                  Lord of the Rings: The Two Towers, The (2002) 
##                                                                              1 
##                                                             Matrix, The (1999) 
##                                                                              1 
##                                               Men in Black (a.k.a. MIB) (1997) 
##                                                                              1 
##                                                     Mission: Impossible (1996) 
##                                                                              1 
##                                                            Pulp Fiction (1994) 
##                                                                              1 
## Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) 
##                                                                              1 
##                                                     Saving Private Ryan (1998) 
##                                                                              1 
##                                                        Schindler's List (1993) 
##                                                                              1 
##                                                    Seven (a.k.a. Se7en) (1995) 
##                                                                              1 
##                                               Shawshank Redemption, The (1994) 
##                                                                              1 
##                                                                   Shrek (2001) 
##                                                                              1 
##                                               Silence of the Lambs, The (1991) 
##                                                                              1 
##                                                        Sixth Sense, The (1999) 
##                                                                              1 
##                                                                   Speed (1994) 
##                                                                              1 
##                                      Star Wars: Episode IV - A New Hope (1977) 
##                                                                              1 
##                          Star Wars: Episode V - The Empire Strikes Back (1980) 
##                                                                              1 
##                              Star Wars: Episode VI - Return of the Jedi (1983) 
##                                                                              1 
##                                              Terminator 2: Judgment Day (1991) 
##                                                                              1 
##                                                               Toy Story (1995) 
##                                                                              1 
##                                                               True Lies (1994) 
##                                                                              1 
##                                      Twelve Monkeys (a.k.a. 12 Monkeys) (1995) 
##                                                                              1 
##                                                     Usual Suspects, The (1995) 
##                                                                              1

# 1.6 k-means
movie_mat <- as.matrix(movie_len)
movie_mat[is.na(movie_mat)] <- 0
kmeans_res <- kmeans(movie_mat, centers = 4)
table(kmeans_res$cluster)

## 
##  1  2  3  4 
## 12 17  5  6

#1.7 WSS

author <- read.csv('author_count.csv', header = TRUE)
dim(author)

## [1] 841  70

colSums(is.na(author))

## author      a    all   also     an    and    any    are     as     at     be 
##      0      0      0      0      0      0      0      0      0      0      0 
##   been    but     by    can     do   down   even  every   for.   from    had 
##      0      0      0      0      0      0      0      0      0      0      0 
##    has   have    her    his    if.    in.   into     is     it    its    may 
##      0      0      0      0      0      0      0      0      0      0      0 
##   more   must     my     no    not    now     of     on    one   only     or 
##      0      0      0      0      0      0      0      0      0      0      0 
##    our should     so   some   such   than   that    the  their   then  there 
##      0      0      0      0      0      0      0      0      0      0      0 
## things   this     to     up   upon    was   were   what   when  which    who 
##      0      0      0      0      0      0      0      0      0      0      0 
##   will   with  would   your 
##      0      0      0      0

#turn Authors into a factor
authors_only <- select(author, author)

author_remove <- select(author, - author)

std_author <- scale(author_remove)

pca_result <- prcomp(std_author, scale = TRUE)

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

pca_author_df <- data.frame(PC1 = pca_result$x[,1], PC2 = pca_result$x[,2], PC3 = pca_result$x[,3], labels = authors_only)

theme_set(
  theme_bw()
)
pca_plot <- ggplot(pca_author_df, aes(x = PC1, y = PC2, color = as.factor(author))) +
            geom_point() +
   labs(title = "PCA: Authors", color = "Authors")+

            theme_minimal()
pca_plot

#2.4
dist_metrics <- c("euclidean", "manhattan", "maximum", "binary", "canberra", "minkowski")

gof_mds <- list()

for (dist in dist_metrics) {
  distances <- dist(author_remove, method = dist)
  
  mds_res <- cmdscale(distances, eig = TRUE)
  
  gof_df <- data.frame(gof = mds_res$GOF)
  gof_df$dist<- dist
  gof_mds[[dist]] <- gof_df
  
  mds_df <- data.frame(X = mds_res[["points"]][,1], Y = mds_res[[ "points"]][,2],author = authors_only)
  
  mds_plt <- ggplot(mds_df, aes(X, Y, color = author)) +
    geom_point() +
    labs(title = paste("MDS Visualization:", dist, " distance"))
  
  print(mds_plt)
}

Question 2.5

PCA

Lets try and create a Scree plot for PCA to see how much of the Explained Variance comes from each successive Principal Component
According to Sanchita Manghale (https://sanchitamangale12.medium.com/scree-plot-733ed72c8608) we can use the Scree Plot criterion to decide the appropriate number of Principal Components. This is the point where the ‘elbow’ of the Scree plot shallows out.

explained_var_pca <- pca_result$sdev^2 / sum(pca_result$sdev^2)

expl_pca_df <- data.frame(PCA_Component = 1:length(explained_var_pca), Explained_Variance = explained_var_pca)

## looks like Scree Criterion occurs on 9th value 

scree_crit <- explained_var_pca[9]

ggplot(expl_pca_df, aes(x = PCA_Component, y = Explained_Variance)) +
  geom_line() +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 20))+
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
  geom_hline(yintercept= scree_crit, linetype = "longdash", color = 'red') +
  labs(x = "Principal Component", y = "Variance Explained", title = "Scree Plot")

9 Principal Components is very large!
and it still only explains ~48% of the variance in the data.
Lets make a Cumulative Variance graph to visualise this better

expl_pca_df <- mutate(expl_pca_df, Cumulative_Variance = cumsum(expl_pca_df$Explained_Variance))

#cumulative scree crit
cum_scree <- expl_pca_df[9,3]

ggplot(expl_pca_df, aes(x = PCA_Component, y = Cumulative_Variance)) +
  geom_line() +
    scale_x_continuous(breaks = scales::pretty_breaks(n = 20))+
  scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
    geom_hline(yintercept= cum_scree, linetype = "longdash", color = 'red') +
  labs(x = "Principal Component", y = "Cumulative Variance Explained", title = "Cumulative Variance vs. Principal Components")

t-SNE

Perplexity - Hyperparameter that balances global/ local attention. In a sense it is a guess about the number of close neighbours each point has. - From Google Cloud Article
My understanding:
- increasing perplexity -> increased weight to global factors, ensuring perplexity > num attrributes
- By definition, cluster Sizes are not directly comparable across perplexities. They get smaller as num neighbours/ perplexity increases
- Distances between clusers is highly dependent on number of attributes/ perplexity making it hard to interpret.
References: https://distill.pub/2016/misread-tsne/,https://learningwithdata.com/posts/tylerfolkman/why-you-are-using-t-sne-wrong-502412aab0c0/

#2.3
library(Rtsne)
library(ggpubr)
set.seed(530306627)

perplexity<- c(1, 5, 10, 20)

rtse <- lapply(perplexity, function(x){
  y <- Rtsne(std_author, dims = 2, perplexity = x)$Y
  attr(y, "perplexity") <- x
  y
}
  )

tsne_plots <- lapply(rtse, function(dat) {
  perplexity <- attr(dat, "perplexity")
  dat <- as.data.frame(dat)
  names(dat) <- c("x", "y")
  dat[["author"]] <- author
  ggplot(dat) + geom_point(aes(x =x, y= y, colour = as.factor(author$author))) +
    labs(title = "t-SNE: Authors", colour = "Authors")+
    ggtitle(paste0("Perplexity = ", perplexity))
  

})

ggarrange(plotlist = tsne_plots, common.legend = TRUE)

perplexity<- c(30, 50, 69, 100)

rtse <- lapply(perplexity, function(x){
  y <- Rtsne(std_author, dims = 2, perplexity = x)$Y
  attr(y, "perplexity") <- x
  y
}
  )

tsne_plots <- lapply(rtse, function(dat) {
  perplexity <- attr(dat, "perplexity")
  dat <- as.data.frame(dat)
  names(dat) <- c("x", "y")
  dat[["author"]] <- author
  ggplot(dat) + geom_point(aes(x =x, y= y, colour = as.factor(author$author))) +
    labs(title = "t-SNE: Authors", colour = "Authors")+
    ggtitle(paste0("Perplexity = ", perplexity))
  

})

ggarrange(plotlist = tsne_plots, common.legend = TRUE)

Discussion

From a perplexity of 5 we start to see distinct groups forming.
As we would expect increasing perplexity -> smaller clusters
The most interesting data points are the books that share a vocabulary with another author:
- With a perplexity of 5 Shakespeare appears to have 1 book that has a similar vocabulary to Jane Austen’s bibliography. This disappears when we increase perplexity to 10
- Jack London has 4 books that share a vocabulary with Jane Austen’s bibliography. As we increase the perplexity we notice these books being ‘pulled’ into Jack London’s cluster. However even with a vocabulary of 69 (good practice maximum as it’s == Sum(attributes)) there is still 1 book that remains in the Jane Austen cluster.
Relative distance between clusters appears to be fairly uniform across perplexities
In order to balance global/ local attention and maintain the interesting overlap data points I would say a perplexity of 15 visually looks the best.

MDS

MDS is a measure of relative similarity/ dissimilarity.
In Q2.4 we computed MDS using a number of different distance measures. I will use the Goodness of Fit measure to compare which distance measure does the best job of explaining the variance of similarity of the data set.
References: https://pages.mtu.edu/~shanem/psy5220/daily/Day16/MDS.html
https://stats.stackexchange.com/questions/22019/how-to-calculate-the-r-squared-value-and-assess-the-model-fit-in-multidimensiona

#extracting the eigenvalue list into a dataframe

final_gof_df <- do.call(rbind, gof_mds)
print(final_gof_df)

##                   gof      dist
## euclidean.1 0.5239598 euclidean
## euclidean.2 0.5239598 euclidean
## manhattan.1 0.2306940 manhattan
## manhattan.2 0.2986354 manhattan
## maximum.1   0.3250229   maximum
## maximum.2   0.4624583   maximum
## binary.1    0.1469623    binary
## binary.2    0.2069811    binary
## canberra.1  0.1306663  canberra
## canberra.2  0.1674362  canberra
## minkowski.1 0.5239598 minkowski
## minkowski.2 0.5239598 minkowski

Discussion

It appears that Euclidean and Minkowski have the best GOF. In fact we see that they are actually the exact same! This is because Minkowski distance by default in the cmdscale function has p = 2 which is the same as Euclidean.
I will investigate other p-values for Minkowski and compare this against Euclidean’s GOF.

p_vals <- seq(2, 6, by = 1)

p_gof_mds <- list()

for (p in p_vals) {

  distances <- dist(author_remove, method = dist, p =p)
  
  mds_res <- cmdscale(distances, eig = TRUE)
  
  p_gof_df <- data.frame( gof =mds_res$GOF)
  p_gof_df$p<- p
  p_gof_mds[[p]] <- p_gof_df
  
  mds_df <- data.frame(X = mds_res[["points"]][,1], Y = mds_res[[ "points"]][,2],author = authors_only)
  
  mds_plt <- ggplot(mds_df, aes(X, Y, color = author)) +
    geom_point() +
    labs(title = paste("MDS Visualization:", p, " distance"))

  print(mds_plt)
  
}

Observations

Increasing p seems to make the clusters tighter together. Not clear whether there is any increased explanatory power. Lets look at the Goodness of Fit Measure

#extracting the eigenvalue list into a dataframe

final_p_df <- do.call(rbind, p_gof_mds)

print(final_p_df)

##          gof p
## 1  0.5239598 2
## 2  0.5239598 2
## 3  0.4714406 3
## 4  0.5393677 3
## 5  0.4437618 4
## 6  0.5416831 4
## 7  0.4213057 5
## 8  0.5346526 5
## 9  0.4039334 6
## 10 0.5257228 6

max_only <- group_by(final_p_df, p) %>% mutate(max_gof = max(gof))

plot(x = max_only$p, y = max_only$max_gof, type = 'b',
     ylab = "Goodness of Fit",
     xlab = "Power of Minkowski Distance",
     main = "GOF vs. P")

Discussion

It appears that increasing p of the Minkowski has a somewhat parabolic relationship with goodness of fit.
The maximum GOF occurs when p == 4

Compare and Contrast our Best Measures:

The best scores for each different method are:
- PCA: 9 components account for ~48% of variance being explained
- t-SNE: A perplexity = 15 strikes the best balance between global/local attention
- MDS: Using Minkowski distance with p = 4 gives the highest GOF score.
From a high level, my understanding of the difference between these measures is that PCA maintains the variance between data points so is directly interpretable. t-SNE just maintains the magnitude in difference between points and MDS just maintains the relative distance.
A PCA with 9 components accounting for ~48% of variance is quite a lot of dimensions. From a visualisation POV this is not very helpful as humans struggle to perceive visualisations in any more than 3 dimensions. However by telling us which 9 dimensions account for ~48% of variance we could use this information to cut down on other dimensions and use one of the other techniques for a more detailed visualisation.
The interesting thing about the t-SNE visualisation is the moving of the book that share a vocabulary with another author. As we increase perplexity the global connection takes over and we get to see these books be ‘drawn’ to their actual author.
The MDS graph gives the best visual representation of how relatively close each authors book’s vocabulary are. Combining this with the information we have gleaned about similar vocabularies from the t-SNE visualisation we can see that the bibligraphies are quite close. With London and Austen having some books that break the general mould.

Tutorial Week 4

2023-08-23

Question 2.5

PCA

t-SNE

Discussion

MDS

Discussion

Observations

Discussion

Compare and Contrast our Best Measures: