Approach A DE II

##28
library(cluster)
data("iris")
str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

row.names(iris)

##   [1] "1"   "2"   "3"   "4"   "5"   "6"   "7"   "8"   "9"   "10"  "11"  "12" 
##  [13] "13"  "14"  "15"  "16"  "17"  "18"  "19"  "20"  "21"  "22"  "23"  "24" 
##  [25] "25"  "26"  "27"  "28"  "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36" 
##  [37] "37"  "38"  "39"  "40"  "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48" 
##  [49] "49"  "50"  "51"  "52"  "53"  "54"  "55"  "56"  "57"  "58"  "59"  "60" 
##  [61] "61"  "62"  "63"  "64"  "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72" 
##  [73] "73"  "74"  "75"  "76"  "77"  "78"  "79"  "80"  "81"  "82"  "83"  "84" 
##  [85] "85"  "86"  "87"  "88"  "89"  "90"  "91"  "92"  "93"  "94"  "95"  "96" 
##  [97] "97"  "98"  "99"  "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [145] "145" "146" "147" "148" "149" "150"

## Data Preprocess

sum(!complete.cases(iris))

## [1] 0

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

## Drop incomplete observations
df <- na.omit(iris)

## Standardize the four measurement columns to mean 0 / sd 1;
## the Species factor (column 5) is left untouched
df[, 1:4] <- scale(df[, 1:4])
head(df)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1   -0.8976739  1.01560199    -1.335752   -1.311052  setosa
## 2   -1.1392005 -0.13153881    -1.335752   -1.311052  setosa
## 3   -1.3807271  0.32731751    -1.392399   -1.311052  setosa
## 4   -1.5014904  0.09788935    -1.279104   -1.311052  setosa
## 5   -1.0184372  1.24503015    -1.335752   -1.311052  setosa
## 6   -0.5353840  1.93331463    -1.165809   -1.048667  setosa

summary(df)

##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

## Standardization check: each measurement should now have sd 1 ...
vapply(df[, -5], sd, numeric(1))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            1            1            1            1

## ... and a mean of zero up to floating-point error
vapply(df[, -5], mean, numeric(1))

##  Sepal.Length   Sepal.Width  Petal.Length   Petal.Width 
## -4.484318e-16  2.034094e-16 -2.895326e-17 -3.663049e-17

# sd() is only meaningful for the numeric measurements. Handing the whole
# frame to apply() turns it into a character matrix because of the Species
# factor, which yielded NA for Species and a coercion warning. Select the
# numeric columns explicitly instead.
vapply(Filter(is.numeric, df), sd, numeric(1))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            1            1            1            1

## Distance function and visualization
library(factoextra)

## Warning: package 'factoextra' was built under R version 4.5.1

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Compute pairwise distances on the four standardized measurements only.
# Including the Species factor made dist() coerce the frame and emit
# "NAs introduced by coercion": the label column became all-NA and took part
# in the distance computation instead of being left out on purpose.
distance <- get_dist(df[, -5])

# Heatmap of the distance matrix (teal = close, orange = far)
fviz_dist(distance, gradient = list(low = "#00afbb", mid= "white", high ="#FC4E07"))

## K-means: two clusters on the standardized measurements (Species excluded)
km_output <- kmeans(
  x = df[, -5],
  centers = 2,
  iter.max = 100,
  nstart = 25,
  algorithm = "Hartigan-Wong"
)
str(km_output)

## List of 9
##  $ cluster     : Named int [1:150] 2 2 2 2 2 2 2 2 2 2 ...
##   ..- attr(*, "names")= chr [1:150] "1" "2" "3" "4" ...
##  $ centers     : num [1:2, 1:4] 0.506 -1.011 -0.425 0.85 0.65 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ totss       : num 596
##  $ withinss    : num [1:2] 173.5 47.4
##  $ tot.withinss: num 221
##  $ betweenss   : num 375
##  $ size        : int [1:2] 100 50
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

names(km_output)

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

typeof(km_output)

## [1] "list"

length(km_output)

## [1] 9

km_output$cluster

##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   2   2   2   2   2   2   2   2   2   2   1   1   1   1   1   1   1   1   1   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 141 142 143 144 145 146 147 148 149 150 
##   1   1   1   1   1   1   1   1   1   1

##Cluster Validation Evaluation-
##Objective function: Sum of Square Error (SSE)
## SSE

##Cluster Cohesion
## SSE can only be used to compare clusterings that have the same number of clusters

# Total sum of squares
km_output[["totss"]]

## [1] 596

# Within-cluster sum of squares, one value per cluster
km_output[["withinss"]]

## [1] 173.52867  47.35062

# Between-cluster sum of squares
km_output[["betweenss"]]

## [1] 375.1207

# Within + between adds up to the total
with(km_output, sum(c(withinss, betweenss)))

## [1] 596

# Share of the total sum of squares that remains inside the clusters
cohesion <- with(km_output, sum(withinss) / totss)
cohesion

## [1] 0.3706028

## Visualize Clusters

fviz_cluster(km_output,data = df[,-5])

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

#df %>% # as.data.frame( )%>% # mutate(cluster=km_output$cluster, Species = #row.names(iris))%>% # ggplot(aes(x = UrbanPop, y = Murder, colour = #factor(cluster), label = Species)) + geom_text()

## Arrest Data

# NOTE: the former rm(list = ls()) was removed. It wipes the caller's whole
# workspace when this script is sourced, yet does not give a clean session
# (attached packages and options stay as they are). Every object used below
# is assigned explicitly before use, so nothing depends on the wipe.

#library(cluster)
#library(tidyverse)

# Load the built-in USArrests data and inspect its structure
data("USArrests")
str(USArrests)

## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

row.names(USArrests)

##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"

# Work on a copy of the built-in data
dataset <- USArrests
# Data preprocessing: number of rows with at least one missing value
sum(!complete.cases(dataset))

## [1] 0

summary(dataset)

##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

# Remove or impute missing objects
df <- na.omit(dataset)

# Rescale: standardize every column to mean 0 / sd 1.
# USArrests has four columns, all numeric, so the whole frame is scaled. The
# earlier `df[, -5]` was copied from the 5-column iris section and only worked
# because an out-of-range negative index selects every column.
df <- as.data.frame(scale(df, center = TRUE, scale = TRUE))
head(df)

##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207

# Keep `dataset` as the standardized copy used by the later steps. The frame
# has only four columns, so there is no fifth column to drop, and `df` already
# holds the same values (the old `df <- dataset` was a no-op).
dataset <- df
summary(df)

##      Murder           Assault           UrbanPop             Rape        
##  Min.   :-1.6044   Min.   :-1.5090   Min.   :-2.31714   Min.   :-1.4874  
##  1st Qu.:-0.8525   1st Qu.:-0.7411   1st Qu.:-0.76271   1st Qu.:-0.6574  
##  Median :-0.1235   Median :-0.1411   Median : 0.03178   Median :-0.1209  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.7949   3rd Qu.: 0.9388   3rd Qu.: 0.84354   3rd Qu.: 0.5277  
##  Max.   : 2.2069   Max.   : 1.9948   Max.   : 1.75892   Max.   : 2.6444

# Each standardized column should have sd 1 ...
vapply(dataset, sd, numeric(1))

##   Murder  Assault UrbanPop     Rape 
##        1        1        1        1

# ... and mean 0 up to floating-point error
vapply(dataset, mean, numeric(1))

##        Murder       Assault      UrbanPop          Rape 
## -7.663087e-17  1.112408e-16 -4.332808e-16  8.942391e-17

# Same check on `df`, which holds the same standardized values
vapply(df, sd, numeric(1))

##   Murder  Assault UrbanPop     Rape 
##        1        1        1        1

# Distance function and visualization
 library(factoextra)
 # Correlation-based (Pearson) distance between states, on standardized data
 distance <- get_dist(df, method = "pearson", stand = TRUE)

# Heatmap of the distance matrix
fviz_dist(
  distance,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)

##   Code3:

## K means
# Cluster the whole standardized frame. It has exactly four numeric columns,
# so the `-5` index copied from the iris section excluded nothing and is gone.
# Note: cluster ids (1/2) are arbitrary labels and may swap between runs.
km_output <- kmeans(df, centers = 2, nstart = 25, iter.max = 100, algorithm = "Hartigan-Wong")
str(km_output)

## List of 9
##  $ cluster     : Named int [1:50] 2 2 2 1 2 2 1 1 2 2 ...
##   ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ centers     : num [1:2, 1:4] -0.67 1.005 -0.676 1.014 -0.132 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
##  $ totss       : num 196
##  $ withinss    : num [1:2] 56.1 46.7
##  $ tot.withinss: num 103
##  $ betweenss   : num 93.1
##  $ size        : int [1:2] 30 20
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"

names(km_output)

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

typeof(km_output)

## [1] "list"

length(km_output)

## [1] 9

km_output$cluster

##        Alabama         Alaska        Arizona       Arkansas     California 
##              2              2              2              1              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              1              1              2              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              1              1              2              1              1 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              1              1              2              1              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              1              2              1              2              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              1              1              2              1              1 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              2              1              1 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              1              1              1              1              2 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              1              2              2              1              1 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              1              1              1              1              1

## Cluster Validation Evaluation  -  
## Objective function:  Sum of Square Error (SSE)
### SSE

##    Code4:
#### Cluster cohesion
#### SSE can only be used to compare clusterings that have the same number of clusters

# Total sum of squares
km_output[["totss"]]

## [1] 196

# Within-cluster sum of squares, one value per cluster
km_output[["withinss"]]

## [1] 56.11445 46.74796

# Between-cluster sum of squares
km_output[["betweenss"]]

## [1] 93.1376

# Within + between adds up to the total
with(km_output, sum(c(withinss, betweenss)))

## [1] 196

# Share of the total sum of squares that remains inside the clusters
cohesion <- with(km_output, sum(withinss) / totss)
cohesion

## [1] 0.5248082

 ### Visualize Clusters
 #   library(factoextra)
    fviz_cluster(km_output, data = df)

library(dplyr)
library(ggplot2)
    
    ##        Code5: 
    
    # df    %>%
    # as.data.frame( df )  %>%
  # Label every state at (UrbanPop, Murder), coloured by its k-means cluster.
  # The cluster id and state name are attached as columns first and then mapped
  # by name, so the aesthetics always stay aligned with the rows being plotted
  # (previously the new columns were created but never used).
  df %>%
    mutate(cluster = factor(km_output$cluster), objects_name = row.names(dataset)) %>%
    ggplot(aes(x = UrbanPop, y = Murder, colour = cluster, label = objects_name)) +
    geom_text()

### Put Cluster Output on the Map(1)  
# One row per state: lower-case state name plus its k-means cluster id
cluster_df <- data.frame(
  objects_names = tolower(rownames(dataset)),
  cluster = as.vector(km_output$cluster)
)
head(cluster_df)

##   objects_names cluster
## 1       alabama       2
## 2        alaska       2
## 3       arizona       2
## 4      arkansas       1
## 5    california       2
## 6      colorado       2

#install.packages("maps")
# Call the key column `state` so it can be joined against the map regions
cluster_df <- rename(cluster_df, state = objects_names)
library(maps)

## Warning: package 'maps' was built under R version 4.5.1

## 
## Attaching package: 'maps'

## The following object is masked from 'package:cluster':
## 
##     votes.repub

# State outlines of the contiguous US (one row per polygon vertex)
states <- map_data("state")
objects_names <- states

# Colour each state by its cluster. Map regions without a match in cluster_df
# get an NA cluster.
objects_names %>%
  left_join(cluster_df, by = c("region" = "state")) %>%
  ggplot() +
  # `group = group` keeps every outline a separate polygon. Without it ggplot
  # groups by the fill factor and joins all states of a cluster into one path.
  geom_polygon(
    aes(x = long, y = lat, group = group, fill = as.factor(cluster)),
    color = "white"
  ) +
  coord_fixed(1.3) +
  # "none" replaces the deprecated `fill = F` (ggplot2 >= 3.3.4)
  guides(fill = "none") +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.line = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    axis.title = element_blank()
  )

 ###  Elbow method to decide Optimal Number of Clusters(1)
     
     set.seed(8)  # fix the random starts so the elbow curve is reproducible

     # Total within-cluster sum of squares of a k-means fit.
     #   k    : number of clusters
     #   data : numeric data to cluster (defaults to the standardized `df`)
     # Returns a single number.
     wss <- function(k, data = df) {
       kmeans(data, centers = k, nstart = 25)$tot.withinss
     }

     k_values <- 1:15
     # Base vapply() replaces purrr::map_dbl(), so no extra package is needed
     wss_values <- vapply(k_values, wss, numeric(1))
     plot(
       x = k_values, y = wss_values,
       type = "b", frame.plot = FALSE,
       xlab = "Number of clusters K",
       ylab = "Total within-clusters sum of square"
     )

  ###  Hierarchical Clustering
    # Agglomerative hierarchical clustering of the states in `dataset`:
    # Euclidean distances between rows, clusters merged by complete linkage.
    hac_output <- hclust( dist(dataset, method = "euclidean"), method = "complete")
    plot(hac_output)       # draw the resulting dendrogram

# (distance: Euclidean; linkage: complete)
    
    ### Output Desirable Number of Clusters after Modeling
    hac_cut <- cutree(hac_output, 2)

    # Cluster ids are arbitrary: cluster "1" from cutree() need not be cluster
    # "1" from kmeans(). Comparing raw ids reported 49 of 50 states as different
    # purely because the two labels were swapped. Map each hierarchical cluster
    # to the k-means cluster it overlaps most, then list the real disagreements.
    hac_vs_km <- table(hac = hac_cut, km = km_output$cluster)
    hac_to_km <- as.integer(colnames(hac_vs_km))[apply(hac_vs_km, 1, which.max)]
    print(names(hac_cut)[hac_to_km[hac_cut] != km_output$cluster])

## [1] "Missouri"

Approach A DE II

Walter James

2025-08-26