Info about iris

dataset <- iris
label_col = 5

str(dataset)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

row.names(dataset)

##   [1] "1"   "2"   "3"   "4"   "5"   "6"   "7"   "8"   "9"   "10"  "11"  "12" 
##  [13] "13"  "14"  "15"  "16"  "17"  "18"  "19"  "20"  "21"  "22"  "23"  "24" 
##  [25] "25"  "26"  "27"  "28"  "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36" 
##  [37] "37"  "38"  "39"  "40"  "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48" 
##  [49] "49"  "50"  "51"  "52"  "53"  "54"  "55"  "56"  "57"  "58"  "59"  "60" 
##  [61] "61"  "62"  "63"  "64"  "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72" 
##  [73] "73"  "74"  "75"  "76"  "77"  "78"  "79"  "80"  "81"  "82"  "83"  "84" 
##  [85] "85"  "86"  "87"  "88"  "89"  "90"  "91"  "92"  "93"  "94"  "95"  "96" 
##  [97] "97"  "98"  "99"  "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [145] "145" "146" "147" "148" "149" "150"

summary(dataset)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

Preprocessing

## Check for incomplete cases
print(paste("The number of incomplete cases is",
            sum(!complete.cases(dataset))
           ))

## [1] "The number of incomplete cases is 0"

## Remove or impute missing objects
df <- na.omit(dataset)

## Rescale (or normalization, etc.)
df <- df[,-label_col]
df <- scale(df, center = T, scale = T)

summary(df)

##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064

Visualization of distance between cases

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.4.2

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

distance <- get_dist(df)
fviz_dist(distance,
          gradient = list(low="#00AFBB", 
                          mid = "white", 
                          high = "#FC4E07"))

K-means Algorithm

km_output <- kmeans(df, centers = 3, nstart = 25, iter.max = 100, 
                    algorithm = "Hartigan-Wong")
str(km_output)

## List of 9
##  $ cluster     : Named int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
##   ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
##  $ centers     : num [1:3, 1:4] -0.0501 -1.0112 1.1322 -0.8804 0.8504 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ totss       : num 596
##  $ withinss    : num [1:3] 44.1 47.4 47.5
##  $ tot.withinss: num 139
##  $ betweenss   : num 457
##  $ size        : int [1:3] 53 50 47
##  $ iter        : int 2
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

names(km_output)

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

typeof(km_output)

## [1] "list"

length(km_output)

## [1] 9

km_output$cluster

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   2   2   2   2   2   2   2   2   2   2   3   3   3   1   1   1   3   1   1   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   1   1   1   1   3   1   1   1   1   3   1   1   1   1   3   3   3   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   1   1   1   1   3   3   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   3   1   3   3   3   3   1   3   3   3   3   3   3   1   1   3   3   3   3   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   3   1   3   1   3   3   1   3   3   3   3   3   3   1   1   3   3   3   1   3 
## 141 142 143 144 145 146 147 148 149 150 
##   3   3   1   3   3   3   1   3   3   1

Cluster Evaluation

## Sum of Squares
print(paste("SST:", km_output$totss))

## [1] "SST: 596"

print("SSwithin:")

## [1] "SSwithin:"

km_output$withinss

## [1] 44.08754 47.35062 47.45019

print(paste("SSbetween:", km_output$betweenss))

## [1] "SSbetween: 457.111640282649"

print(paste("SSwithin + SSbetween:",
            sum(c(km_output$withinss, km_output$betweenss))))

## [1] "SSwithin + SSbetween: 596"

cohesion <- sum(km_output$withinss)/ km_output$totss
print(paste("cohesion:", cohesion))

## [1] "cohesion: 0.233034160599583"

## Visualizing Clusters
fviz_cluster(km_output, data = df)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
dataset %>%
  # as.data.frame(  ) %>%
  mutate(cluster = km_output$cluster, 
         objects_name = row.names(dataset)) %>% 
  ggplot(aes(x = Petal.Width, y = Sepal.Length,
             label = factor(cluster), 
             color = Species)) + geom_text(  )

Elbow method for the optimal number of cluster

set.seed(8)
wss <- function(k) {
        return(kmeans(df, k,  nstart = 25)$tot.withinss)
}

k_values <- 1:15
wss_values <- purrr::map_dbl(k_values, wss)
plot(x = k_values, y = wss_values,
       type = "b", frame = F,
       xlab = "Number of clusters K",
       ylab = "Total within-clusters sum of square")

We see an elbow at \(k=2\), which matches the two clear clusters in the scatterplot of Sepal.Length vs Petal.Width earlier. However, we know that there are three species present in the data, and indeed we see a less sharp “elbow” at \(k=3\).

Hierarchical clustering

We use Euclidean distance and complete linkage for the hclus() function.

hac_output <- hclust(dist(dataset, method = "euclidean"), method = "complete")
plot(hac_output)

hac_cut <- cutree(hac_output, 3)

table(hac_cut)

## hac_cut
##  1  2  3 
## 50 72 28

km_cluster <- km_output$cluster
table(km_cluster)

## km_cluster
##  1  2  3 
## 53 50 47

The hierarchical method differs from the k-means method for the three clusters each identifies, as shown in the tables above.

Data Mining II - Assign 3 - Question 28