library(mlbench)
library(ggplot2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##row.names(iris)
#Data preprocessing
sum(!complete.cases(iris))
## [1] 0
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
##Remove or impute missing objects
df<-na.omit(iris)
df<-iris[, -5]
##Rescale (or normalization, etc.)
df<-scale(df, center=TRUE, scale=TRUE)
summary(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.86378 Min. :-2.4258 Min. :-1.5623 Min. :-1.4422
## 1st Qu.:-0.89767 1st Qu.:-0.5904 1st Qu.:-1.2225 1st Qu.:-1.1799
## Median :-0.05233 Median :-0.1315 Median : 0.3354 Median : 0.1321
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.67225 3rd Qu.: 0.5567 3rd Qu.: 0.7602 3rd Qu.: 0.7880
## Max. : 2.48370 Max. : 3.0805 Max. : 1.7799 Max. : 1.7064
str(df)
## num [1:150, 1:4] -0.898 -1.139 -1.381 -1.501 -1.018 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## - attr(*, "scaled:center")= Named num [1:4] 5.84 3.06 3.76 1.2
## ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## - attr(*, "scaled:scale")= Named num [1:4] 0.828 0.436 1.765 0.762
## ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##Standardization
apply(df, 2, sd)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 1 1 1
apply(df, 2, mean)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## -4.484318e-16 2.034094e-16 -2.895326e-17 -3.663049e-17
distance<- get_dist(df)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

## K means
km_output<-kmeans(df, centers = 2, nstart = 25, iter.max = 100, algorithm = "Hartigan-Wong")
str(km_output)
## List of 9
## $ cluster : int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
## $ centers : num [1:2, 1:4] 0.506 -1.011 -0.425 0.85 0.65 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## $ totss : num 596
## $ withinss : num [1:2] 173.5 47.4
## $ tot.withinss: num 221
## $ betweenss : num 375
## $ size : int [1:2] 100 50
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
names(km_output)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
typeof(km_output)
## [1] "list"
length(km_output)
## [1] 9
km_output$cluster
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1
## Cluster Validation Evaluation -
## Objective function: Sum of Square Error (SSE)
km_output$totss
## [1] 596
km_output$withinss
## [1] 173.52867 47.35062
km_output$betweenss
## [1] 375.1207
sum(c(km_output$withinss, km_output$betweenss))
## [1] 596
cohesion<-sum(km_output$withinss)/km_output$totss
cohesion
## [1] 0.3706028
### Visualizing clusters
fviz_cluster(km_output, data=df)

##code 5
df1<-as.data.frame(df)
df1<- mutate(df1, cluster=km_output$cluster,objects_names= row.names(df1))
ggplot(df1, aes(x =Petal.Length , y = Sepal.Length, color = factor(cluster),label = objects_names)) + geom_text()

##code 7
## Finding number of clusters
set.seed(8)
wss<-function(k) {
return(kmeans(df, k, nstart = 25)$tot.withinss)
}
k_values <-1:15
wss_values<- purrr::map_dbl(k_values, wss)
plot(x = k_values, y = wss_values, type = "b", frame = F, xlab = "Number of clusters K",
ylab = "Total within-clusters sum of square")

##code 6
### Putting Cluster Output on the Map(1)
##
##code 8 Hierarchical clustering
# Calculating distance using hierarchical clustering, using Euclidean distance
# and using complete linkage for hierarchical clustering
hac_output <-hclust(dist(df, method = "euclidean"), method = "complete")
plot(hac_output)

### Output Desirable Number of Clusters after Modeling
hac_cut <- cutree(hac_output, 2)
##Visualizing the hierarchical clusters
h_clust1<-data.frame(index=which(hac_cut==1))
h_clust1<-mutate(h_clust1, cluster=1)
h_clust2<-data.frame(index=which(hac_cut==2))
h_clust2<- mutate(h_clust2, cluster=2)
colnames(h_clust2)<-colnames(h_clust1)
h_clust<-rbind(h_clust1, h_clust2)
colnames(h_clust)<-c('index', 'cluster')
ggplot(df1, aes(x =Petal.Length , y = Sepal.Length, color = factor(h_clust$cluster),label = h_clust$index)) + geom_text()

##Printing the mismatched clusters and indexes of objects mismatched
for ( i in 1:length(hac_cut)) {
if( hac_cut[i] != df1$cluster[i]){print(hac_cut[i]) ; print(df1$objects_names[i])}
}
## [1] 1
## [1] "1"
## [1] 1
## [1] "2"
## [1] 1
## [1] "3"
## [1] 1
## [1] "4"
## [1] 1
## [1] "5"
## [1] 1
## [1] "6"
## [1] 1
## [1] "7"
## [1] 1
## [1] "8"
## [1] 1
## [1] "9"
## [1] 1
## [1] "10"
## [1] 1
## [1] "11"
## [1] 1
## [1] "12"
## [1] 1
## [1] "13"
## [1] 1
## [1] "14"
## [1] 1
## [1] "15"
## [1] 1
## [1] "16"
## [1] 1
## [1] "17"
## [1] 1
## [1] "18"
## [1] 1
## [1] "19"
## [1] 1
## [1] "20"
## [1] 1
## [1] "21"
## [1] 1
## [1] "22"
## [1] 1
## [1] "23"
## [1] 1
## [1] "24"
## [1] 1
## [1] "25"
## [1] 1
## [1] "26"
## [1] 1
## [1] "27"
## [1] 1
## [1] "28"
## [1] 1
## [1] "29"
## [1] 1
## [1] "30"
## [1] 1
## [1] "31"
## [1] 1
## [1] "32"
## [1] 1
## [1] "33"
## [1] 1
## [1] "34"
## [1] 1
## [1] "35"
## [1] 1
## [1] "36"
## [1] 1
## [1] "37"
## [1] 1
## [1] "38"
## [1] 1
## [1] "39"
## [1] 1
## [1] "40"
## [1] 1
## [1] "41"
## [1] 1
## [1] "42"
## [1] 1
## [1] "43"
## [1] 1
## [1] "44"
## [1] 1
## [1] "45"
## [1] 1
## [1] "46"
## [1] 1
## [1] "47"
## [1] 1
## [1] "48"
## [1] 1
## [1] "49"
## [1] 1
## [1] "50"
## [1] 2
## [1] "51"
## [1] 2
## [1] "52"
## [1] 2
## [1] "53"
## [1] 2
## [1] "55"
## [1] 2
## [1] "57"
## [1] 2
## [1] "59"
## [1] 2
## [1] "62"
## [1] 2
## [1] "64"
## [1] 2
## [1] "65"
## [1] 2
## [1] "66"
## [1] 2
## [1] "67"
## [1] 2
## [1] "71"
## [1] 2
## [1] "72"
## [1] 2
## [1] "73"
## [1] 2
## [1] "74"
## [1] 2
## [1] "75"
## [1] 2
## [1] "76"
## [1] 2
## [1] "77"
## [1] 2
## [1] "78"
## [1] 2
## [1] "79"
## [1] 2
## [1] "84"
## [1] 2
## [1] "85"
## [1] 2
## [1] "86"
## [1] 2
## [1] "87"
## [1] 2
## [1] "89"
## [1] 2
## [1] "92"
## [1] 2
## [1] "96"
## [1] 2
## [1] "97"
## [1] 2
## [1] "98"
## [1] 2
## [1] "101"
## [1] 2
## [1] "102"
## [1] 2
## [1] "103"
## [1] 2
## [1] "104"
## [1] 2
## [1] "105"
## [1] 2
## [1] "106"
## [1] 2
## [1] "108"
## [1] 2
## [1] "109"
## [1] 2
## [1] "110"
## [1] 2
## [1] "111"
## [1] 2
## [1] "112"
## [1] 2
## [1] "113"
## [1] 2
## [1] "114"
## [1] 2
## [1] "115"
## [1] 2
## [1] "116"
## [1] 2
## [1] "117"
## [1] 2
## [1] "118"
## [1] 2
## [1] "119"
## [1] 2
## [1] "121"
## [1] 2
## [1] "122"
## [1] 2
## [1] "123"
## [1] 2
## [1] "124"
## [1] 2
## [1] "125"
## [1] 2
## [1] "126"
## [1] 2
## [1] "127"
## [1] 2
## [1] "128"
## [1] 2
## [1] "129"
## [1] 2
## [1] "130"
## [1] 2
## [1] "131"
## [1] 2
## [1] "132"
## [1] 2
## [1] "133"
## [1] 2
## [1] "134"
## [1] 2
## [1] "135"
## [1] 2
## [1] "136"
## [1] 2
## [1] "137"
## [1] 2
## [1] "138"
## [1] 2
## [1] "139"
## [1] 2
## [1] "140"
## [1] 2
## [1] "141"
## [1] 2
## [1] "142"
## [1] 2
## [1] "143"
## [1] 2
## [1] "144"
## [1] 2
## [1] "145"
## [1] 2
## [1] "146"
## [1] 2
## [1] "147"
## [1] 2
## [1] "148"
## [1] 2
## [1] "149"
## [1] 2
## [1] "150"
length(which(hac_cut!= df1$cluster))
## [1] 127
#Comparing performance of each clustering method with original dataset
prop.table(table(df1$cluster, iris$Species))
##
## setosa versicolor virginica
## 1 0.0000000 0.3333333 0.3333333
## 2 0.3333333 0.0000000 0.0000000
prop.table(table(hac_cut, iris$Species))
##
## hac_cut setosa versicolor virginica
## 1 0.33333333 0.14000000 0.01333333
## 2 0.00000000 0.19333333 0.32000000
(table(hac_cut, df1$cluster))
##
## hac_cut 1 2
## 1 23 50
## 2 77 0
##Comparing the two approaches
prop.table(table(hac_cut, df1$cluster))
##
## hac_cut 1 2
## 1 0.1533333 0.3333333
## 2 0.5133333 0.0000000
##