ex1_s

# Load packages
library(kernlab)
library(tidyverse)
library(fpc) #visualisation
library(factoextra) #visualisation
library(vegan)

# Load data into Nxp data matrix.
digitmat=matrix(digits,nrow=600,ncol=28*28,byrow=TRUE)

# Load labels, ie a vector with the actual handwritten numbers
digit.labels = digit.labels[,1]

# Display a digit
digit=matrix(digitmat[2,],nrow=28,ncol=28,byrow=FALSE)
image(digit[,28:1],col=gray(0:200 /200),axes=FALSE)

# In this exercise we will use binarized digits
digitmatbin = digitmat>0.5

set.seed(1234)
# Use k-means model with k=3 
km_d <- kmeans(digitmatbin, centers = 3, nstart = 25)

#str(km_d) # Print contents of the model output

# Extract cluster centers
ckm_d <- km_d$centers


# Typical cluster digits
par(mfrow = c(1, 3))
layout(matrix(seq_len(nrow(ckm_d)), 1, 3, byrow = FALSE))
for(i in seq_len(nrow(ckm_d))) {
  image(matrix(ckm_d[i, ], 28, 28)[, 28:1], 
        col=gray(0:200 /200),axes=FALSE)
}

# Change the centroids named "1" to "4"
km_d2 <- km_d %>%
  rapply(function(x) ifelse(x==1,4,x), how = "replace")

# Actual vs Cluster
km_d_comp <- data.frame(
  cluster = km_d2$cluster,
  actual = digit.labels
) %>%
  group_by(cluster) %>%
  ungroup() %>%
  mutate_all(factor, levels = 2:4) 

summary(km_d_comp)

##  cluster actual 
##  2:152   2:197  
##  3:213   3:190  
##  4:235   4:213

table(actual=digit.labels, cluster=km_d2$cluster)

##       cluster
## actual   2   3   4
##      2 136  45  16
##      3  14 166  10
##      4   2   2 209

plot(table(digit.labels, km_d2$cluster),
      main="Confusion Matrix",
      xlab="Actual", ylab="Cluster")

# Create mode function
mode_fun <- function(x){  
  which.max(tabulate(x))
}

# Comparison to mode
km_d_comp2 <- data.frame(
  cluster = km_d2$cluster,
  actual = digit.labels
) %>%
  group_by(cluster) %>%
  mutate(mode = mode_fun(actual)) %>%
  ungroup() %>%
  mutate_all(factor, levels = 2:4)

summary(km_d_comp2)

##  cluster actual  mode   
##  2:152   2:197   2:152  
##  3:213   3:190   3:213  
##  4:235   4:213   4:235

#plot
vcol <- c("blue","black","red")
plotcluster(digitmatbin, digit.labels, col=vcol[km_d$cluster])

#Conduct spectral clustering
scd <- specc(digitmatbin, centers = 3)


#centers(scd)
size(scd) #Cluster size

## [1] 230 218 152

withinss(scd) #Within-cluster sum of squares

## [1] 37978.51 39932.17 31858.55

dl <- data.frame(digit.labels) %>%
  group_by(digit.labels) %>%
  summarise(count=n())

# Actual amount of digits
dl

## # A tibble: 3 x 2
##   digit.labels count
##          <int> <int>
## 1            2   197
## 2            3   190
## 3            4   213

# Removal of constant columns (for scaling)
digitmatbin2 <- digitmatbin[,apply(digitmatbin, 2, var, na.rm=TRUE) != 0]

# ?princomp=> for Q-mode PCA use prcomp.
pca2 <- prcomp(digitmatbin2, rank. = 6, center = TRUE, scale. = TRUE)
#attributes(pca2)

eigenvals2 = pca2$sdev^2        # SD to variance 
#which(eigenvals2 >= 1)
plot(eigenvals2/eigenvals2[1],ylim=c(0,1), xlim=c(0,15)) # maybe 6

pvar_ex2 <- pca2$sdev^2 / sum (pca2$sdev^2) # % of variance explained

fviz_eig(pca2, ncp = 15) #Plot the percentage of variances explained by each principal component

pc_ve2 <- tibble(pvar_ex2) %>%
  summarise(desc(pvar_ex2))

pc_ve2[1:6,]

## # A tibble: 6 x 1
##   `desc(pvar_ex2)`
##              <dbl>
## 1          -0.0822
## 2          -0.0669
## 3          -0.0553
## 4          -0.0396
## 5          -0.0303
## 6          -0.0263

# The first PC explains only 8.2% of the total variance

# Plot,percent variance exlained as a function of the  number of principal components
plot(cumsum(pvar_ex2), xlab="Principal Component",  ylab="Cumulative Proportion of Variance Explained",main="Principal Components vs % Variance explained",ylim=c(0,1),type='l',lwd=2, col="blue")

#about 420 components explain most variation in the data

dd <- dist(digitmatbin)

diso <- isomap(dd, ndim=2, k=7)

plot(diso, type="n")
points(scores(diso), col="purple")

diso2=isomap(dd, ndim=2, k=8)
plot(diso2, type="n")
points(scores(diso2), col="red")

isopoints <- as.matrix(dist(diso$points))

ddist <-dist(digitmatbin)
ddistm <- as.matrix(ddist)
plot(ddistm,isopoints,main="ISOMAP",xlab="Original", ylab="Estimated")

set.seed(1234)
# Use k-means 
isokm_d <- kmeans(isopoints, centers = 3, nstart = 25)


# Change the centroids named "1" to "4"
ikm_d2 <- isokm_d %>%
  rapply(function(x) ifelse(x==1,4,x), how = "replace")

table(actual = digit.labels, cluster = ikm_d2$cluster)

##       cluster
## actual   2   3   4
##      2 135  55   7
##      3   5 181   4
##      4   3   0 210

#proportions for comparison
prop.table(table(actual=digit.labels, cluster=km_d2$cluster), 1) #original

##       cluster
## actual           2           3           4
##      2 0.690355330 0.228426396 0.081218274
##      3 0.073684211 0.873684211 0.052631579
##      4 0.009389671 0.009389671 0.981220657

prop.table(table(actual = digit.labels, isocluster = ikm_d2$cluster), 1) #iso

##       isocluster
## actual          2          3          4
##      2 0.68527919 0.27918782 0.03553299
##      3 0.02631579 0.95263158 0.02105263
##      4 0.01408451 0.00000000 0.98591549