# Dataset URL: http://bit.ly/IrisDataset
iris_df <- read.csv("http://bit.ly/IrisDataset")
head(iris_df)
## sepal_length sepal_width petal_length petal_width species
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
# Drop the species column, keeping only the four numeric features
# (note: this masks R's built-in `iris` dataset for this session)
iris <- iris_df[, 1:4]
head(iris)
## sepal_length sepal_width petal_length petal_width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
# 1. PCA
iris.pca <- prcomp(iris, center = TRUE, scale. = TRUE)
summary(iris.pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.7061 0.9598 0.38387 0.14355
## Proportion of Variance 0.7277 0.2303 0.03684 0.00515
## Cumulative Proportion 0.7277 0.9580 0.99485 1.00000
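# Sanity check (a minimal sketch, assuming iris.pca as above): the
# variance proportions reported by summary() are just the squared
# component standard deviations over their total.
pve <- iris.pca$sdev^2 / sum(iris.pca$sdev^2)
round(pve, 5)          # matches the "Proportion of Variance" row
round(cumsum(pve), 5)  # matches the "Cumulative Proportion" row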
library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
ggbiplot(iris.pca)
ggbiplot(iris.pca, labels=rownames(iris), obs.scale = 1, var.scale = 1)
ggbiplot(iris.pca, ellipse = TRUE, labels = rownames(iris_df), groups = iris_df$species, obs.scale = 1, var.scale = 1)
ggbiplot(iris.pca, ellipse = TRUE, choices = c(3, 4), labels = rownames(iris), groups = iris_df$species)
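# ggbiplot is not always straightforward to install; as a fallback, a
# plain ggplot2 score plot reproduces the essentials of the grouped
# biplot (a sketch, assuming iris.pca and iris_df as above).
scores <- as.data.frame(iris.pca$x)  # principal component scores
scores$species <- iris_df$species
ggplot(scores, aes(PC1, PC2, colour = species)) +
  geom_point() +
  stat_ellipse()  # per-species ellipses, as in the ggbiplot version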
# 2. t-SNE
# Removing duplicate rows (Rtsne checks for duplicates by default)
iris_clean <- unique(iris)
library(Rtsne)
# Curating the data for analysis
iris_df$species <- as.factor(iris_df$species)
# The labels must line up with the deduplicated rows fed to Rtsne,
# so drop the same duplicate rows from the species vector
label <- iris_df$species[!duplicated(iris)]
# For plotting: one colour per species
colors <- rainbow(length(unique(label)))
names(colors) <- unique(label)
# Executing the algorithm on the curated data
tsne <- Rtsne(iris_clean, dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500)
## Performing PCA
## Read the 147 x 4 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.16 seconds (sparsity = 0.721181)!
## Learning embedding...
## Iteration 50: error is 45.988995 (50 iterations in 0.26 seconds)
## Iteration 100: error is 44.978908 (50 iterations in 0.06 seconds)
## Iteration 150: error is 45.245843 (50 iterations in 0.18 seconds)
## Iteration 200: error is 45.889853 (50 iterations in 0.16 seconds)
## Iteration 250: error is 43.956327 (50 iterations in 0.11 seconds)
## Iteration 300: error is 0.226706 (50 iterations in 0.06 seconds)
## Iteration 350: error is 0.133463 (50 iterations in 0.06 seconds)
## Iteration 400: error is 0.123750 (50 iterations in 0.17 seconds)
## Iteration 450: error is 0.121119 (50 iterations in 0.21 seconds)
## Iteration 500: error is 0.119249 (50 iterations in 0.23 seconds)
## Fitting performed in 1.50 seconds.
# Plotting the embedding and examining the cluster structure
plot(tsne$Y, t = 'n', main = "tsne")
text(tsne$Y, labels = label, col = colors[label])
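# t-SNE maps are sensitive to perplexity and to the random seed. A quick
# robustness check (a sketch; the perplexity values here are arbitrary)
# is to re-run the embedding with a few settings and compare the maps.
# Note that Rtsne requires 3 * perplexity < nrow(iris_clean) - 1.
for (p in c(5, 15, 45)) {
  set.seed(42)  # fix the seed so runs are comparable
  fit <- Rtsne(iris_clean, dims = 2, perplexity = p, max_iter = 500)
  plot(fit$Y, col = colors[label], pch = 19,
       main = paste("t-SNE, perplexity =", p))
}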
# 3. DBSCAN
library(dbscan)
# We want a minimum of 4 points within a distance of eps = 0.4
# (the dbscan package spells the argument minPts; the fpc-style
# spelling MinPts is accepted but triggers a conversion warning)
db <- dbscan(iris_clean, eps = 0.4, minPts = 4)
print(db)
## DBSCAN clustering for 147 objects.
## Parameters: eps = 0.4, minPts = 4
## The clustering contains 4 cluster(s) and 25 noise points.
##
## 0 1 2 3 4
## 25 45 38 35 4
##
## Available fields: cluster, eps, minPts
# Plotting the clusters
hullplot(iris_clean,db$cluster)
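# The eps value above was picked by hand. A common heuristic (a sketch
# using dbscan's kNNdistplot) is to look for the "knee" in the sorted
# k-nearest-neighbour distances; k = minPts - 1 is one rule of thumb.
kNNdistplot(iris_clean, k = 4 - 1)
abline(h = 0.4, col = "red", lty = 2)  # the eps used above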
# 4. K-means
# Normalizing the dataset so that no single attribute has more
# impact on the clustering algorithm than the others
iris_norm <- as.data.frame(scale(iris_clean))
head(iris_norm)
## sepal_length sepal_width petal_length petal_width
## 1 -0.9123902 1.0164962 -1.353111 -1.331149
## 2 -1.1536157 -0.1276458 -1.353111 -1.331149
## 3 -1.3948411 0.3300110 -1.409958 -1.331149
## 4 -1.5154539 0.1011826 -1.296264 -1.331149
## 5 -1.0330029 1.2453246 -1.353111 -1.331149
## 6 -0.5505520 1.9318098 -1.182570 -1.067253
set.seed(123)  # kmeans uses random starts; fix the seed for reproducibility
iris.k <- kmeans(iris_norm, centers = 3, nstart = 25)
print(iris.k)
## K-means clustering with 3 clusters of sizes 52, 48, 47
##
## Cluster means:
## sepal_length sepal_width petal_length petal_width
## 1 -0.06578159 -0.87573859 0.3271523 0.2573021
## 2 -1.02043910 0.85917667 -1.3175816 -1.2651751
## 3 1.11493021 0.09144524 0.9836595 1.0074190
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 36 37 39 40 41 42
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
## 2 2 2 2 2 2 2 2 3 3 3 1 1 1 3 1 1 1 1 1
## 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
## 1 1 1 3 1 1 1 1 3 1 1 1 1 3 3 3 1 1 1 1
## 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
## 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1
## 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
## 3 3 3 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1 3 1
## 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
## 3 1 3 3 1 3 3 3 3 3 3 1 1 3 3 3 1 3 3 3
## 144 145 146 147 148 149 150
## 3 3 3 1 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 43.50132 46.35034 47.46552
## (between_SS / total_SS = 76.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(cluster)
fviz_cluster(iris.k, data = iris_norm)
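# k = 3 was chosen here because we know the data has three species.
# Without that knowledge, factoextra's fviz_nbclust offers quick
# diagnostics (a sketch; results vary with the random starts).
fviz_nbclust(iris_norm, kmeans, method = "wss")         # elbow plot
fviz_nbclust(iris_norm, kmeans, method = "silhouette")  # avg. silhouette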
# 5. Hierarchical Clustering
d <- dist(iris_norm, method = "euclidean")
method_hc <- c("ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid")
# Baseline: Ward's method, computed once for comparison in every iteration
hc_ward <- hclust(d, method = "ward.D")
res.coph_ward <- cophenetic(hc_ward)
for (h in method_hc) {
  hc_h <- hclust(d, method = h)
  plot(hc_h, cex = 0.6, hang = -1)
  rect.hclust(hc_h, k = 3, border = 2:6)
  abline(h = 3, col = 'red')
  print(h)
  # Cophenetic distances for the current method
  res.coph_h <- cophenetic(hc_h)
  # Correlation between the cophenetic and the original distances:
  # first the ward.D baseline, then the current method
  print(cor(d, res.coph_ward))
  print(cor(d, res.coph_h))
}
## [1] "ward.D"
## [1] 0.8061101
## [1] 0.8061101
## [1] "ward.D2"
## [1] 0.8061101
## [1] 0.8224484
## [1] "single"
## [1] 0.8061101
## [1] 0.8272214
## [1] "complete"
## [1] 0.8061101
## [1] 0.7477116
## [1] "average"
## [1] 0.8061101
## [1] 0.8527345
## [1] "mcquitty"
## [1] 0.8061101
## [1] 0.7139378
## [1] "median"
## [1] 0.8061101
## [1] 0.6413899
## [1] "centroid"
## [1] 0.8061101
## [1] 0.8453117
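# As a cross-check (a sketch, reusing the deduplicated `label` vector
# from the t-SNE step), cut the tree from the method with the highest
# cophenetic correlation (average linkage) into 3 clusters and compare
# the result against the species labels.
hc_avg <- hclust(d, method = "average")
table(cutree(hc_avg, k = 3), label)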
Judging by the dendrograms, the Ward.D, Ward.D2 and complete-linkage methods give the cleanest three-cluster structure. However, the cophenetic correlations tell a different story: average linkage (0.853) and centroid linkage (0.845) preserve the original pairwise distances best, so the visually tidiest trees are not the ones most faithful to the underlying distances.