Code

rm(list = ls())
str(iris )

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

row.names(iris)

##   [1] "1"   "2"   "3"   "4"   "5"   "6"   "7"   "8"   "9"   "10"  "11"  "12" 
##  [13] "13"  "14"  "15"  "16"  "17"  "18"  "19"  "20"  "21"  "22"  "23"  "24" 
##  [25] "25"  "26"  "27"  "28"  "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36" 
##  [37] "37"  "38"  "39"  "40"  "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48" 
##  [49] "49"  "50"  "51"  "52"  "53"  "54"  "55"  "56"  "57"  "58"  "59"  "60" 
##  [61] "61"  "62"  "63"  "64"  "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72" 
##  [73] "73"  "74"  "75"  "76"  "77"  "78"  "79"  "80"  "81"  "82"  "83"  "84" 
##  [85] "85"  "86"  "87"  "88"  "89"  "90"  "91"  "92"  "93"  "94"  "95"  "96" 
##  [97] "97"  "98"  "99"  "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [145] "145" "146" "147" "148" "149" "150"

## Data Preprocess
sum(!complete.cases(iris))

## [1] 0

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

## Remove or impute missing objects
df <-  na.omit(iris)

## Rescale (or normalization, etc.)
df <-  scale(df[,-5], center = T, scale = T)

summary(df)

##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064

## Standardization
apply(iris[,-5], 2, sd)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##    0.8280661    0.4358663    1.7652982    0.7622377

apply(iris[,-5], 2, mean)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333

apply(df, 2, mean)

##  Sepal.Length   Sepal.Width  Petal.Length   Petal.Width 
## -4.484318e-16  2.034094e-16 -2.895326e-17 -3.663049e-17

apply(df, 2, sd)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            1            1            1            1

## Distance function and visualization
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.2

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

distance <- get_dist(df)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))

## K means
# When I ran the code, the elbow plot shows 3 clusters as the best option. That is why I modified the code and put 3 instead of 2 centers.
km_output <- kmeans(df, centers = 3, nstart = 25, iter.max = 100, algorithm = "Hartigan-Wong")
str(km_output)

## List of 9
##  $ cluster     : Named int [1:150] 3 3 3 3 3 3 3 3 3 3 ...
##   ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
##  $ centers     : num [1:3, 1:4] -0.0501 1.1322 -1.0112 -0.8804 0.0881 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ totss       : num 596
##  $ withinss    : num [1:3] 44.1 47.5 47.4
##  $ tot.withinss: num 139
##  $ betweenss   : num 457
##  $ size        : int [1:3] 53 47 50
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

names(km_output)

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

typeof(km_output)

## [1] "list"

length(km_output)

## [1] 9

km_output$cluster

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   3   3   3   3   3   3   3   3   3   3   2   2   2   1   1   1   2   1   1   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   1   1   1   1   2   1   1   1   1   2   1   1   1   1   2   2   2   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   1   1   1   1   2   2   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   2   1   2   2   2   2   1   2   2   2   2   2   2   1   1   2   2   2   2   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   2   1   2   1   2   2   1   2   2   2   2   2   2   1   1   2   2   2   1   2 
## 141 142 143 144 145 146 147 148 149 150 
##   2   2   1   2   2   2   1   2   2   1

## Cluster Validation Evaluation  -  
## Objective function:  Sum of Square Error (SSE)
### SSE

#### Cluster cohesion
#### SSE can be used to compare cluster performance only for a similar number of clusters

km_output$totss

## [1] 596

km_output$withinss      # distance without and within clusters

## [1] 44.08754 47.45019 47.35062

km_output$betweenss

## [1] 457.1116

sum(c(km_output$withinss, km_output$betweenss))

## [1] 596

cohesion <- sum(km_output$withinss)/ km_output$totss
cohesion

## [1] 0.2330342

### Visualize Clusters
fviz_cluster(km_output, data = df)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

df    %>%
      as.data.frame(  )  %>%
      mutate(cluster = km_output$cluster, Species = row.names(iris))    %>% 
      ggplot(aes(x = Sepal.Length, y = Sepal.Width, color = factor(cluster), label = Species)) + geom_text(  )

### Put Cluster Output on the Map(1)  
## Goegraphical coordinations are not available for the iris dataset. So I commented this section.
# cluster_df <-  data.frame(Species = tolower(row.names(iris)), cluster = unname(km_output$cluster))
# 
# library(maps)
# Species <- map_data("Species")
# 
# Species  %>%
#     left_join(cluster_df, by = c("region" = "Species"))  %>%
#     ggplot( ) +
#     geom_polygon(aes(x = long, y = lat, fill = as.factor(cluster), group =group), color = "white") +
#     coord_fixed(1.3) +
#     guides(fill = F) +
#     theme_bw( ) +
#     theme(panel.grid.major = element_blank( ), panel.grid.minor = element_blank( ),
#                panel.border = element_blank( ),
#                axis.line = element_blank( ),
#                axis.text = element_blank( ),
#                axis.ticks = element_blank( ),
#                axis.title = element_blank( ))    

###  Elbow method to decide Optimal Number of Clusters(1)

set.seed(8)
wss <- function(k) {
        return(kmeans(df, k,  nstart = 25)$tot.withinss)
}

k_values <- 1:15
wss_values <- purrr::map_dbl(k_values, wss)

plot(x = k_values, y = wss_values,
       type = "b", frame = F,
       xlab = "Number of clusters K",
       ylab = "Total within-clusters sum of square")

###  Hierarchical Clustering
hac_output <- hclust(dist(iris, method = "euclidean"), method = "complete")

## Warning in dist(iris, method = "euclidean"): NAs introduced by coercion

plot(hac_output)       # Calculating distance using hierarchical clustering, using Euclidean distance

                       # and using complete linkage for hierarchical clustering

### Output Desirable Number of Clusters after Modeling
hac_cut <- cutree(hac_output, 3)
for ( i in 1:length(hac_cut)) {
    if( hac_cut[i]   != km_output$cluster[i])   print(names(hac_cut) [i])
}

## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL
## NULL

DataMining2_Assignment3 (Iris Dataset)

Soudeh Khoubrouy

2025-01-31

Code