# Load packages
library(kernlab)
library(tidyverse)
library(fpc) #visualisation
library(factoextra) #visualisation
library(vegan)
# Load data into Nxp data matrix.
digitmat=matrix(digits,nrow=600,ncol=28*28,byrow=TRUE)
# Load labels, ie a vector with the actual handwritten numbers
digit.labels = digit.labels[,1]
# Display a digit
digit=matrix(digitmat[2,],nrow=28,ncol=28,byrow=FALSE)
image(digit[,28:1],col=gray(0:200 /200),axes=FALSE)

# In this exercise we will use binarized digits
digitmatbin = digitmat>0.5
set.seed(1234)
# Use k-means model with k=3
km_d <- kmeans(digitmatbin, centers = 3, nstart = 25)
#str(km_d) # Print contents of the model output
# Extract cluster centers
ckm_d <- km_d$centers
# Typical cluster digits
par(mfrow = c(1, 3))
layout(matrix(seq_len(nrow(ckm_d)), 1, 3, byrow = FALSE))
for(i in seq_len(nrow(ckm_d))) {
image(matrix(ckm_d[i, ], 28, 28)[, 28:1],
col=gray(0:200 /200),axes=FALSE)
}

# Change the centroids named "1" to "4"
km_d2 <- km_d %>%
rapply(function(x) ifelse(x==1,4,x), how = "replace")
# Actual vs Cluster
km_d_comp <- data.frame(
cluster = km_d2$cluster,
actual = digit.labels
) %>%
group_by(cluster) %>%
ungroup() %>%
mutate_all(factor, levels = 2:4)
summary(km_d_comp)
## cluster actual
## 2:152 2:197
## 3:213 3:190
## 4:235 4:213
table(actual=digit.labels, cluster=km_d2$cluster)
## cluster
## actual 2 3 4
## 2 136 45 16
## 3 14 166 10
## 4 2 2 209
plot(table(digit.labels, km_d2$cluster),
main="Confusion Matrix",
xlab="Actual", ylab="Cluster")

# Create mode function
mode_fun <- function(x){
which.max(tabulate(x))
}
# Comparison to mode
km_d_comp2 <- data.frame(
cluster = km_d2$cluster,
actual = digit.labels
) %>%
group_by(cluster) %>%
mutate(mode = mode_fun(actual)) %>%
ungroup() %>%
mutate_all(factor, levels = 2:4)
summary(km_d_comp2)
## cluster actual mode
## 2:152 2:197 2:152
## 3:213 3:190 3:213
## 4:235 4:213 4:235
#plot
vcol <- c("blue","black","red")
plotcluster(digitmatbin, digit.labels, col=vcol[km_d$cluster])

#Conduct spectral clustering
scd <- specc(digitmatbin, centers = 3)
#centers(scd)
size(scd) #Cluster size
## [1] 230 218 152
withinss(scd) #Within-cluster sum of squares
## [1] 37978.51 39932.17 31858.55
dl <- data.frame(digit.labels) %>%
group_by(digit.labels) %>%
summarise(count=n())
# Actual amount of digits
dl
## # A tibble: 3 x 2
## digit.labels count
## <int> <int>
## 1 2 197
## 2 3 190
## 3 4 213
# Removal of constant columns (for scaling)
digitmatbin2 <- digitmatbin[,apply(digitmatbin, 2, var, na.rm=TRUE) != 0]
# ?princomp=> for Q-mode PCA use prcomp.
pca2 <- prcomp(digitmatbin2, rank. = 6, center = TRUE, scale. = TRUE)
#attributes(pca2)
eigenvals2 = pca2$sdev^2 # SD to variance
#which(eigenvals2 >= 1)
plot(eigenvals2/eigenvals2[1],ylim=c(0,1), xlim=c(0,15)) # maybe 6

pvar_ex2 <- pca2$sdev^2 / sum (pca2$sdev^2) # % of variance explained
fviz_eig(pca2, ncp = 15) #Plot the percentage of variances explained by each principal component

pc_ve2 <- tibble(pvar_ex2) %>%
summarise(desc(pvar_ex2))
pc_ve2[1:6,]
## # A tibble: 6 x 1
## `desc(pvar_ex2)`
## <dbl>
## 1 -0.0822
## 2 -0.0669
## 3 -0.0553
## 4 -0.0396
## 5 -0.0303
## 6 -0.0263
# The first PC explains only 8.2% of the total variance
# Plot,percent variance exlained as a function of the number of principal components
plot(cumsum(pvar_ex2), xlab="Principal Component", ylab="Cumulative Proportion of Variance Explained",main="Principal Components vs % Variance explained",ylim=c(0,1),type='l',lwd=2, col="blue")

#about 420 components explain most variation in the data
dd <- dist(digitmatbin)
diso <- isomap(dd, ndim=2, k=7)
plot(diso, type="n")
points(scores(diso), col="purple")

diso2=isomap(dd, ndim=2, k=8)
plot(diso2, type="n")
points(scores(diso2), col="red")

isopoints <- as.matrix(dist(diso$points))
ddist <-dist(digitmatbin)
ddistm <- as.matrix(ddist)
plot(ddistm,isopoints,main="ISOMAP",xlab="Original", ylab="Estimated")

set.seed(1234)
# Use k-means
isokm_d <- kmeans(isopoints, centers = 3, nstart = 25)
# Change the centroids named "1" to "4"
ikm_d2 <- isokm_d %>%
rapply(function(x) ifelse(x==1,4,x), how = "replace")
table(actual = digit.labels, cluster = ikm_d2$cluster)
## cluster
## actual 2 3 4
## 2 135 55 7
## 3 5 181 4
## 4 3 0 210
#proportions for comparison
prop.table(table(actual=digit.labels, cluster=km_d2$cluster), 1) #original
## cluster
## actual 2 3 4
## 2 0.690355330 0.228426396 0.081218274
## 3 0.073684211 0.873684211 0.052631579
## 4 0.009389671 0.009389671 0.981220657
prop.table(table(actual = digit.labels, isocluster = ikm_d2$cluster), 1) #iso
## isocluster
## actual 2 3 4
## 2 0.68527919 0.27918782 0.03553299
## 3 0.02631579 0.95263158 0.02105263
## 4 0.01408451 0.00000000 0.98591549