# Dataset URL: http://bit.ly/IrisDataset
iris_df <- read.csv("http://bit.ly/IrisDataset")
head(iris_df)
## sepal_length sepal_width petal_length petal_width species
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
# Drop the species column, keeping only the four numeric features
# (note: this masks R's built-in `iris` dataset for this session)
iris <- iris_df[, 1:4]
head(iris)
## sepal_length sepal_width petal_length petal_width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
# 1. PCA
iris.pca <- prcomp(iris, center = TRUE, scale. = TRUE)
summary(iris.pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.7061 0.9598 0.38387 0.14355
## Proportion of Variance 0.7277 0.2303 0.03684 0.00515
## Cumulative Proportion 0.7277 0.9580 0.99485 1.00000
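# Sanity check (a minimal sketch, assuming iris.pca as above): the
# variance proportions reported by summary() are just the squared
# component standard deviations over their total.
pve <- iris.pca$sdev^2 / sum(iris.pca$sdev^2)
round(pve, 5)          # matches the "Proportion of Variance" row
round(cumsum(pve), 5)  # matches the "Cumulative Proportion" row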
library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
ggbiplot(iris.pca)
ggbiplot(iris.pca, labels=rownames(iris), obs.scale = 1, var.scale = 1)
ggbiplot(iris.pca, ellipse = TRUE, labels = rownames(iris_df), groups = iris_df$species, obs.scale = 1, var.scale = 1)
ggbiplot(iris.pca, ellipse = TRUE, choices = c(3, 4), labels = rownames(iris), groups = iris_df$species)
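# ggbiplot is not always straightforward to install; as a fallback, a
# plain ggplot2 score plot reproduces the essentials of the grouped
# biplot (a sketch, assuming iris.pca and iris_df as above).
scores <- as.data.frame(iris.pca$x)  # principal component scores
scores$species <- iris_df$species
ggplot(scores, aes(PC1, PC2, colour = species)) +
  geom_point() +
  stat_ellipse()  # per-species ellipses, as in the ggbiplot version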
# 2. t-SNE
# Removing duplicate rows (Rtsne checks for duplicates by default)
iris_clean <- unique(iris)
library(Rtsne)
# Curating the data for analysis
iris_df$species <- as.factor(iris_df$species)
# The labels must line up with the deduplicated rows fed to Rtsne,
# so drop the same duplicate rows from the species vector
label <- iris_df$species[!duplicated(iris)]
# For plotting: one colour per species
colors <- rainbow(length(unique(label)))
names(colors) <- unique(label)
# Executing the algorithm on the curated data
tsne <- Rtsne(iris_clean, dims = 2, perplexity = 30, verbose = TRUE, max_iter = 500)
## Performing PCA
## Read the 147 x 4 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.16 seconds (sparsity = 0.721181)!
## Learning embedding...
## Iteration 50: error is 45.988995 (50 iterations in 0.26 seconds)
## Iteration 100: error is 44.978908 (50 iterations in 0.06 seconds)
## Iteration 150: error is 45.245843 (50 iterations in 0.18 seconds)
## Iteration 200: error is 45.889853 (50 iterations in 0.16 seconds)
## Iteration 250: error is 43.956327 (50 iterations in 0.11 seconds)
## Iteration 300: error is 0.226706 (50 iterations in 0.06 seconds)
## Iteration 350: error is 0.133463 (50 iterations in 0.06 seconds)
## Iteration 400: error is 0.123750 (50 iterations in 0.17 seconds)
## Iteration 450: error is 0.121119 (50 iterations in 0.21 seconds)
## Iteration 500: error is 0.119249 (50 iterations in 0.23 seconds)
## Fitting performed in 1.50 seconds.
# Plotting the embedding and examining the cluster structure
plot(tsne$Y, t = 'n', main = "tsne")
text(tsne$Y, labels = label, col = colors[label])
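# t-SNE maps are sensitive to perplexity and to the random seed. A quick
# robustness check (a sketch; the perplexity values here are arbitrary)
# is to re-run the embedding with a few settings and compare the maps.
# Note that Rtsne requires 3 * perplexity < nrow(iris_clean) - 1.
for (p in c(5, 15, 45)) {
  set.seed(42)  # fix the seed so runs are comparable
  fit <- Rtsne(iris_clean, dims = 2, perplexity = p, max_iter = 500)
  plot(fit$Y, col = colors[label], pch = 19,
       main = paste("t-SNE, perplexity =", p))
}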
# 3. DBSCAN
library(dbscan)
# We want a minimum of 4 points within a distance of eps = 0.4
# (the dbscan package spells the argument minPts; the fpc-style
# spelling MinPts is accepted but triggers a conversion warning)
db <- dbscan(iris_clean, eps = 0.4, minPts = 4)
print(db)
## DBSCAN clustering for 147 objects.
## Parameters: eps = 0.4, minPts = 4
## The clustering contains 4 cluster(s) and 25 noise points.
##
## 0 1 2 3 4
## 25 45 38 35 4
##
## Available fields: cluster, eps, minPts
# Plotting the clusters
hullplot(iris_clean,db$cluster)
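# The eps value above was picked by hand. A common heuristic (a sketch
# using dbscan's kNNdistplot) is to look for the "knee" in the sorted
# k-nearest-neighbour distances; k = minPts - 1 is one rule of thumb.
kNNdistplot(iris_clean, k = 4 - 1)
abline(h = 0.4, col = "red", lty = 2)  # the eps used above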
# 4. K-means
# Normalizing the dataset so that no single attribute has more
# impact on the clustering algorithm than the others
iris_norm <- as.data.frame(scale(iris_clean))
head(iris_norm)
## sepal_length sepal_width petal_length petal_width
## 1 -0.9123902 1.0164962 -1.353111 -1.331149
## 2 -1.1536157 -0.1276458 -1.353111 -1.331149
## 3 -1.3948411 0.3300110 -1.409958 -1.331149
## 4 -1.5154539 0.1011826 -1.296264 -1.331149
## 5 -1.0330029 1.2453246 -1.353111 -1.331149
## 6 -0.5505520 1.9318098 -1.182570 -1.067253
set.seed(123)  # kmeans uses random starts; fix the seed for reproducibility
iris.k <- kmeans(iris_norm, centers = 3, nstart = 25)
print(iris.k)
## K-means clustering with 3 clusters of sizes 52, 48, 47
##
## Cluster means:
## sepal_length sepal_width petal_length petal_width
## 1 -0.06578159 -0.87573859 0.3271523 0.2573021
## 2 -1.02043910 0.85917667 -1.3175816 -1.2651751
## 3 1.11493021 0.09144524 0.9836595 1.0074190
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 36 37 39 40 41 42
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
## 2 2 2 2 2 2 2 2 3 3 3 1 1 1 3 1 1 1 1 1
## 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
## 1 1 1 3 1 1 1 1 3 1 1 1 1 3 3 3 1 1 1 1
## 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
## 1 1 1 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1
## 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
## 3 3 3 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1 3 1
## 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
## 3 1 3 3 1 3 3 3 3 3 3 1 1 3 3 3 1 3 3 3
## 144 145 146 147 148 149 150
## 3 3 3 1 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 43.50132 46.35034 47.46552
## (between_SS / total_SS = 76.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(cluster)
fviz_cluster(iris.k, data = iris_norm)
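# k = 3 was chosen here because we know the data has three species.
# Without that knowledge, factoextra's fviz_nbclust offers quick
# diagnostics (a sketch; results vary with the random starts).
fviz_nbclust(iris_norm, kmeans, method = "wss")         # elbow plot
fviz_nbclust(iris_norm, kmeans, method = "silhouette")  # avg. silhouette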
# 5. Hierarchical Clustering
d <- dist(iris_norm, method = "euclidean")
method_hc <- c("ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid")
# Baseline: Ward's method, computed once for comparison in every iteration
hc_ward <- hclust(d, method = "ward.D")
res.coph_ward <- cophenetic(hc_ward)
for (h in method_hc) {
  hc_h <- hclust(d, method = h)
  plot(hc_h, cex = 0.6, hang = -1)
  rect.hclust(hc_h, k = 3, border = 2:6)
  abline(h = 3, col = 'red')
  print(h)
  # Cophenetic distances for the current method
  res.coph_h <- cophenetic(hc_h)
  # Correlation between the cophenetic and the original distances:
  # first the ward.D baseline, then the current method
  print(cor(d, res.coph_ward))
  print(cor(d, res.coph_h))
}
## [1] "ward.D"
## [1] 0.8061101
## [1] 0.8061101
## [1] "ward.D2"
## [1] 0.8061101
## [1] 0.8224484
## [1] "single"
## [1] 0.8061101
## [1] 0.8272214
## [1] "complete"
## [1] 0.8061101
## [1] 0.7477116
## [1] "average"
## [1] 0.8061101
## [1] 0.8527345
## [1] "mcquitty"
## [1] 0.8061101
## [1] 0.7139378
## [1] "median"
## [1] 0.8061101
## [1] 0.6413899
## [1] "centroid"
## [1] 0.8061101
## [1] 0.8453117
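# As a cross-check (a sketch, reusing the deduplicated `label` vector
# from the t-SNE step), cut the tree from the method with the highest
# cophenetic correlation (average linkage) into 3 clusters and compare
# the result against the species labels.
hc_avg <- hclust(d, method = "average")
table(cutree(hc_avg, k = 3), label)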
Judging by the dendrograms, the Ward.D, Ward.D2 and complete-linkage methods give the cleanest three-cluster structure. However, the cophenetic correlations tell a different story: average linkage (0.853) and centroid linkage (0.845) preserve the original pairwise distances best, so the visually tidiest trees are not the ones most faithful to the underlying distances.