library(mlbench)
library(ggplot2)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.2.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data(iris)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##row.names(iris)
#Data preprocessing
sum(!complete.cases(iris))
## [1] 0
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
##Remove or impute missing objects
df<-na.omit(iris)
df<-iris[, -5]
##Rescale (or normalization, etc.)
df<-scale(df, center=TRUE, scale=TRUE)
summary(df)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064
str(df)
##  num [1:150, 1:4] -0.898 -1.139 -1.381 -1.501 -1.018 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  - attr(*, "scaled:center")= Named num [1:4] 5.84 3.06 3.76 1.2
##   ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  - attr(*, "scaled:scale")= Named num [1:4] 0.828 0.436 1.765 0.762
##   ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##Standardization
apply(df, 2, sd)
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            1            1            1            1
apply(df, 2, mean)
##  Sepal.Length   Sepal.Width  Petal.Length   Petal.Width 
## -4.484318e-16  2.034094e-16 -2.895326e-17 -3.663049e-17
distance<- get_dist(df)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) 

## K means

km_output<-kmeans(df, centers = 2, nstart = 25, iter.max = 100, algorithm = "Hartigan-Wong")
str(km_output)
## List of 9
##  $ cluster     : int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
##  $ centers     : num [1:2, 1:4] 0.506 -1.011 -0.425 0.85 0.65 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ totss       : num 596
##  $ withinss    : num [1:2] 173.5 47.4
##  $ tot.withinss: num 221
##  $ betweenss   : num 375
##  $ size        : int [1:2] 100 50
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
names(km_output)
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
typeof(km_output)
## [1] "list"
length(km_output)
## [1] 9
km_output$cluster
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1
## Cluster Validation Evaluation  -  
## Objective function:  Sum of Square Error (SSE)
km_output$totss
## [1] 596
km_output$withinss
## [1] 173.52867  47.35062
km_output$betweenss
## [1] 375.1207
sum(c(km_output$withinss, km_output$betweenss))
## [1] 596
cohesion<-sum(km_output$withinss)/km_output$totss
cohesion
## [1] 0.3706028
### Visualizing clusters
fviz_cluster(km_output, data=df)

##code 5
df1<-as.data.frame(df)
df1<- mutate(df1, cluster=km_output$cluster,objects_names= row.names(df1)) 
ggplot(df1, aes(x =Petal.Length , y = Sepal.Length, color = factor(cluster),label = objects_names)) + geom_text()

##code 7

## Finding number of clusters
set.seed(8)
wss<-function(k) {
     return(kmeans(df, k,  nstart = 25)$tot.withinss)
    }
k_values <-1:15
    wss_values<- purrr::map_dbl(k_values, wss)
  plot(x = k_values, y = wss_values, type = "b", frame = F, xlab = "Number of clusters K",
       ylab = "Total within-clusters sum of square")

##code 6
### Putting Cluster Output on the Map(1)  
  
##
  
##code 8 Hierarchical clustering
# Calculating distance using hierarchical clustering, using Euclidean distance
# and using complete linkage for hierarchical clustering
hac_output <-hclust(dist(df, method = "euclidean"), method = "complete")
plot(hac_output)

### Output Desirable Number of Clusters after Modeling
hac_cut <- cutree(hac_output, 2)
##Visualizing the hierarchical clusters 

h_clust1<-data.frame(index=which(hac_cut==1))
h_clust1<-mutate(h_clust1, cluster=1)
h_clust2<-data.frame(index=which(hac_cut==2))
h_clust2<- mutate(h_clust2, cluster=2)
colnames(h_clust2)<-colnames(h_clust1)
h_clust<-rbind(h_clust1, h_clust2)
colnames(h_clust)<-c('index', 'cluster')
ggplot(df1, aes(x =Petal.Length , y = Sepal.Length, color = factor(h_clust$cluster),label = h_clust$index)) + geom_text()

##Printing the mismatched clusters and indexes of objects mismatched
for ( i in 1:length(hac_cut)) {
   if( hac_cut[i]   != df1$cluster[i]){print(hac_cut[i]) ;        print(df1$objects_names[i])}
}
## [1] 1
## [1] "1"
## [1] 1
## [1] "2"
## [1] 1
## [1] "3"
## [1] 1
## [1] "4"
## [1] 1
## [1] "5"
## [1] 1
## [1] "6"
## [1] 1
## [1] "7"
## [1] 1
## [1] "8"
## [1] 1
## [1] "9"
## [1] 1
## [1] "10"
## [1] 1
## [1] "11"
## [1] 1
## [1] "12"
## [1] 1
## [1] "13"
## [1] 1
## [1] "14"
## [1] 1
## [1] "15"
## [1] 1
## [1] "16"
## [1] 1
## [1] "17"
## [1] 1
## [1] "18"
## [1] 1
## [1] "19"
## [1] 1
## [1] "20"
## [1] 1
## [1] "21"
## [1] 1
## [1] "22"
## [1] 1
## [1] "23"
## [1] 1
## [1] "24"
## [1] 1
## [1] "25"
## [1] 1
## [1] "26"
## [1] 1
## [1] "27"
## [1] 1
## [1] "28"
## [1] 1
## [1] "29"
## [1] 1
## [1] "30"
## [1] 1
## [1] "31"
## [1] 1
## [1] "32"
## [1] 1
## [1] "33"
## [1] 1
## [1] "34"
## [1] 1
## [1] "35"
## [1] 1
## [1] "36"
## [1] 1
## [1] "37"
## [1] 1
## [1] "38"
## [1] 1
## [1] "39"
## [1] 1
## [1] "40"
## [1] 1
## [1] "41"
## [1] 1
## [1] "42"
## [1] 1
## [1] "43"
## [1] 1
## [1] "44"
## [1] 1
## [1] "45"
## [1] 1
## [1] "46"
## [1] 1
## [1] "47"
## [1] 1
## [1] "48"
## [1] 1
## [1] "49"
## [1] 1
## [1] "50"
## [1] 2
## [1] "51"
## [1] 2
## [1] "52"
## [1] 2
## [1] "53"
## [1] 2
## [1] "55"
## [1] 2
## [1] "57"
## [1] 2
## [1] "59"
## [1] 2
## [1] "62"
## [1] 2
## [1] "64"
## [1] 2
## [1] "65"
## [1] 2
## [1] "66"
## [1] 2
## [1] "67"
## [1] 2
## [1] "71"
## [1] 2
## [1] "72"
## [1] 2
## [1] "73"
## [1] 2
## [1] "74"
## [1] 2
## [1] "75"
## [1] 2
## [1] "76"
## [1] 2
## [1] "77"
## [1] 2
## [1] "78"
## [1] 2
## [1] "79"
## [1] 2
## [1] "84"
## [1] 2
## [1] "85"
## [1] 2
## [1] "86"
## [1] 2
## [1] "87"
## [1] 2
## [1] "89"
## [1] 2
## [1] "92"
## [1] 2
## [1] "96"
## [1] 2
## [1] "97"
## [1] 2
## [1] "98"
## [1] 2
## [1] "101"
## [1] 2
## [1] "102"
## [1] 2
## [1] "103"
## [1] 2
## [1] "104"
## [1] 2
## [1] "105"
## [1] 2
## [1] "106"
## [1] 2
## [1] "108"
## [1] 2
## [1] "109"
## [1] 2
## [1] "110"
## [1] 2
## [1] "111"
## [1] 2
## [1] "112"
## [1] 2
## [1] "113"
## [1] 2
## [1] "114"
## [1] 2
## [1] "115"
## [1] 2
## [1] "116"
## [1] 2
## [1] "117"
## [1] 2
## [1] "118"
## [1] 2
## [1] "119"
## [1] 2
## [1] "121"
## [1] 2
## [1] "122"
## [1] 2
## [1] "123"
## [1] 2
## [1] "124"
## [1] 2
## [1] "125"
## [1] 2
## [1] "126"
## [1] 2
## [1] "127"
## [1] 2
## [1] "128"
## [1] 2
## [1] "129"
## [1] 2
## [1] "130"
## [1] 2
## [1] "131"
## [1] 2
## [1] "132"
## [1] 2
## [1] "133"
## [1] 2
## [1] "134"
## [1] 2
## [1] "135"
## [1] 2
## [1] "136"
## [1] 2
## [1] "137"
## [1] 2
## [1] "138"
## [1] 2
## [1] "139"
## [1] 2
## [1] "140"
## [1] 2
## [1] "141"
## [1] 2
## [1] "142"
## [1] 2
## [1] "143"
## [1] 2
## [1] "144"
## [1] 2
## [1] "145"
## [1] 2
## [1] "146"
## [1] 2
## [1] "147"
## [1] 2
## [1] "148"
## [1] 2
## [1] "149"
## [1] 2
## [1] "150"
length(which(hac_cut!= df1$cluster))
## [1] 127
#Comparing performance of each clustering method with original dataset
prop.table(table(df1$cluster, iris$Species))
##    
##        setosa versicolor virginica
##   1 0.0000000  0.3333333 0.3333333
##   2 0.3333333  0.0000000 0.0000000
prop.table(table(hac_cut, iris$Species))
##        
## hac_cut     setosa versicolor  virginica
##       1 0.33333333 0.14000000 0.01333333
##       2 0.00000000 0.19333333 0.32000000
(table(hac_cut, df1$cluster))
##        
## hac_cut  1  2
##       1 23 50
##       2 77  0
##Comparing the two approaches 
prop.table(table(hac_cut, df1$cluster))
##        
## hac_cut         1         2
##       1 0.1533333 0.3333333
##       2 0.5133333 0.0000000
##