1 Load Dataset

# Read the breast-cancer CSV (path is relative to the working directory)
# and preview the first six rows.
df_cancer <- read.csv("breast-cancer.csv")
head(df_cancer)
        id diagnosis radius_mean texture_mean perimeter_mean area_mean
1   842302         M       17.99        10.38         122.80    1001.0
2   842517         M       20.57        17.77         132.90    1326.0
3 84300903         M       19.69        21.25         130.00    1203.0
4 84348301         M       11.42        20.38          77.58     386.1
5 84358402         M       20.29        14.34         135.10    1297.0
6   843786         M       12.45        15.70          82.57     477.1
  smoothness_mean compactness_mean concavity_mean concave.points_mean
1         0.11840          0.27760         0.3001             0.14710
2         0.08474          0.07864         0.0869             0.07017
3         0.10960          0.15990         0.1974             0.12790
4         0.14250          0.28390         0.2414             0.10520
5         0.10030          0.13280         0.1980             0.10430
6         0.12780          0.17000         0.1578             0.08089
  symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
1        0.2419                0.07871    1.0950     0.9053        8.589
2        0.1812                0.05667    0.5435     0.7339        3.398
3        0.2069                0.05999    0.7456     0.7869        4.585
4        0.2597                0.09744    0.4956     1.1560        3.445
5        0.1809                0.05883    0.7572     0.7813        5.438
6        0.2087                0.07613    0.3345     0.8902        2.217
  area_se smoothness_se compactness_se concavity_se concave.points_se
1  153.40      0.006399        0.04904      0.05373           0.01587
2   74.08      0.005225        0.01308      0.01860           0.01340
3   94.03      0.006150        0.04006      0.03832           0.02058
4   27.23      0.009110        0.07458      0.05661           0.01867
5   94.44      0.011490        0.02461      0.05688           0.01885
6   27.19      0.007510        0.03345      0.03672           0.01137
  symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
1     0.03003             0.006193        25.38         17.33          184.60
2     0.01389             0.003532        24.99         23.41          158.80
3     0.02250             0.004571        23.57         25.53          152.50
4     0.05963             0.009208        14.91         26.50           98.87
5     0.01756             0.005115        22.54         16.67          152.20
6     0.02165             0.005082        15.47         23.75          103.40
  area_worst smoothness_worst compactness_worst concavity_worst
1     2019.0           0.1622            0.6656          0.7119
2     1956.0           0.1238            0.1866          0.2416
3     1709.0           0.1444            0.4245          0.4504
4      567.7           0.2098            0.8663          0.6869
5     1575.0           0.1374            0.2050          0.4000
6      741.6           0.1791            0.5249          0.5355
  concave.points_worst symmetry_worst fractal_dimension_worst
1               0.2654         0.4601                 0.11890
2               0.1860         0.2750                 0.08902
3               0.2430         0.3613                 0.08758
4               0.2575         0.6638                 0.17300
5               0.1625         0.2364                 0.07678
6               0.1741         0.3985                 0.12440

2 EDA

2.1 Cek Tipe Data

# Show each column's storage type alongside its first few values.
str(df_cancer)
'data.frame':   569 obs. of  32 variables:
 $ id                     : int  842302 842517 84300903 84348301 84358402 843786 844359 84458202 844981 84501001 ...
 $ diagnosis              : chr  "M" "M" "M" "M" ...
 $ radius_mean            : num  18 20.6 19.7 11.4 20.3 ...
 $ texture_mean           : num  10.4 17.8 21.2 20.4 14.3 ...
 $ perimeter_mean         : num  122.8 132.9 130 77.6 135.1 ...
 $ area_mean              : num  1001 1326 1203 386 1297 ...
 $ smoothness_mean        : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
 $ compactness_mean       : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
 $ concavity_mean         : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
 $ concave.points_mean    : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
 $ symmetry_mean          : num  0.242 0.181 0.207 0.26 0.181 ...
 $ fractal_dimension_mean : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
 $ radius_se              : num  1.095 0.543 0.746 0.496 0.757 ...
 $ texture_se             : num  0.905 0.734 0.787 1.156 0.781 ...
 $ perimeter_se           : num  8.59 3.4 4.58 3.44 5.44 ...
 $ area_se                : num  153.4 74.1 94 27.2 94.4 ...
 $ smoothness_se          : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
 $ compactness_se         : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
 $ concavity_se           : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
 $ concave.points_se      : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
 $ symmetry_se            : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
 $ fractal_dimension_se   : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
 $ radius_worst           : num  25.4 25 23.6 14.9 22.5 ...
 $ texture_worst          : num  17.3 23.4 25.5 26.5 16.7 ...
 $ perimeter_worst        : num  184.6 158.8 152.5 98.9 152.2 ...
 $ area_worst             : num  2019 1956 1709 568 1575 ...
 $ smoothness_worst       : num  0.162 0.124 0.144 0.21 0.137 ...
 $ compactness_worst      : num  0.666 0.187 0.424 0.866 0.205 ...
 $ concavity_worst        : num  0.712 0.242 0.45 0.687 0.4 ...
 $ concave.points_worst   : num  0.265 0.186 0.243 0.258 0.163 ...
 $ symmetry_worst         : num  0.46 0.275 0.361 0.664 0.236 ...
 $ fractal_dimension_worst: num  0.1189 0.089 0.0876 0.173 0.0768 ...

2.2 Statistika Deskriptif

# Summarise every column (quartiles and mean for the numeric ones).
summary(df_cancer)
       id             diagnosis          radius_mean      texture_mean  
 Min.   :     8670   Length:569         Min.   : 6.981   Min.   : 9.71  
 1st Qu.:   869218   Class :character   1st Qu.:11.700   1st Qu.:16.17  
 Median :   906024   Mode  :character   Median :13.370   Median :18.84  
 Mean   : 30371831                      Mean   :14.127   Mean   :19.29  
 3rd Qu.:  8813129                      3rd Qu.:15.780   3rd Qu.:21.80  
 Max.   :911320502                      Max.   :28.110   Max.   :39.28  
 perimeter_mean     area_mean      smoothness_mean   compactness_mean 
 Min.   : 43.79   Min.   : 143.5   Min.   :0.05263   Min.   :0.01938  
 1st Qu.: 75.17   1st Qu.: 420.3   1st Qu.:0.08637   1st Qu.:0.06492  
 Median : 86.24   Median : 551.1   Median :0.09587   Median :0.09263  
 Mean   : 91.97   Mean   : 654.9   Mean   :0.09636   Mean   :0.10434  
 3rd Qu.:104.10   3rd Qu.: 782.7   3rd Qu.:0.10530   3rd Qu.:0.13040  
 Max.   :188.50   Max.   :2501.0   Max.   :0.16340   Max.   :0.34540  
 concavity_mean    concave.points_mean symmetry_mean    fractal_dimension_mean
 Min.   :0.00000   Min.   :0.00000     Min.   :0.1060   Min.   :0.04996       
 1st Qu.:0.02956   1st Qu.:0.02031     1st Qu.:0.1619   1st Qu.:0.05770       
 Median :0.06154   Median :0.03350     Median :0.1792   Median :0.06154       
 Mean   :0.08880   Mean   :0.04892     Mean   :0.1812   Mean   :0.06280       
 3rd Qu.:0.13070   3rd Qu.:0.07400     3rd Qu.:0.1957   3rd Qu.:0.06612       
 Max.   :0.42680   Max.   :0.20120     Max.   :0.3040   Max.   :0.09744       
   radius_se        texture_se      perimeter_se       area_se       
 Min.   :0.1115   Min.   :0.3602   Min.   : 0.757   Min.   :  6.802  
 1st Qu.:0.2324   1st Qu.:0.8339   1st Qu.: 1.606   1st Qu.: 17.850  
 Median :0.3242   Median :1.1080   Median : 2.287   Median : 24.530  
 Mean   :0.4052   Mean   :1.2169   Mean   : 2.866   Mean   : 40.337  
 3rd Qu.:0.4789   3rd Qu.:1.4740   3rd Qu.: 3.357   3rd Qu.: 45.190  
 Max.   :2.8730   Max.   :4.8850   Max.   :21.980   Max.   :542.200  
 smoothness_se      compactness_se      concavity_se     concave.points_se 
 Min.   :0.001713   Min.   :0.002252   Min.   :0.00000   Min.   :0.000000  
 1st Qu.:0.005169   1st Qu.:0.013080   1st Qu.:0.01509   1st Qu.:0.007638  
 Median :0.006380   Median :0.020450   Median :0.02589   Median :0.010930  
 Mean   :0.007041   Mean   :0.025478   Mean   :0.03189   Mean   :0.011796  
 3rd Qu.:0.008146   3rd Qu.:0.032450   3rd Qu.:0.04205   3rd Qu.:0.014710  
 Max.   :0.031130   Max.   :0.135400   Max.   :0.39600   Max.   :0.052790  
  symmetry_se       fractal_dimension_se  radius_worst   texture_worst  
 Min.   :0.007882   Min.   :0.0008948    Min.   : 7.93   Min.   :12.02  
 1st Qu.:0.015160   1st Qu.:0.0022480    1st Qu.:13.01   1st Qu.:21.08  
 Median :0.018730   Median :0.0031870    Median :14.97   Median :25.41  
 Mean   :0.020542   Mean   :0.0037949    Mean   :16.27   Mean   :25.68  
 3rd Qu.:0.023480   3rd Qu.:0.0045580    3rd Qu.:18.79   3rd Qu.:29.72  
 Max.   :0.078950   Max.   :0.0298400    Max.   :36.04   Max.   :49.54  
 perimeter_worst    area_worst     smoothness_worst  compactness_worst
 Min.   : 50.41   Min.   : 185.2   Min.   :0.07117   Min.   :0.02729  
 1st Qu.: 84.11   1st Qu.: 515.3   1st Qu.:0.11660   1st Qu.:0.14720  
 Median : 97.66   Median : 686.5   Median :0.13130   Median :0.21190  
 Mean   :107.26   Mean   : 880.6   Mean   :0.13237   Mean   :0.25427  
 3rd Qu.:125.40   3rd Qu.:1084.0   3rd Qu.:0.14600   3rd Qu.:0.33910  
 Max.   :251.20   Max.   :4254.0   Max.   :0.22260   Max.   :1.05800  
 concavity_worst  concave.points_worst symmetry_worst   fractal_dimension_worst
 Min.   :0.0000   Min.   :0.00000      Min.   :0.1565   Min.   :0.05504        
 1st Qu.:0.1145   1st Qu.:0.06493      1st Qu.:0.2504   1st Qu.:0.07146        
 Median :0.2267   Median :0.09993      Median :0.2822   Median :0.08004        
 Mean   :0.2722   Mean   :0.11461      Mean   :0.2901   Mean   :0.08395        
 3rd Qu.:0.3829   3rd Qu.:0.16140      3rd Qu.:0.3179   3rd Qu.:0.09208        
 Max.   :1.2520   Max.   :0.29100      Max.   :0.6638   Max.   :0.20750        

2.3 Cek missing value

# Count missing (NA) entries per column.
colSums(is.na(df_cancer))
                     id               diagnosis             radius_mean 
                      0                       0                       0 
           texture_mean          perimeter_mean               area_mean 
                      0                       0                       0 
        smoothness_mean        compactness_mean          concavity_mean 
                      0                       0                       0 
    concave.points_mean           symmetry_mean  fractal_dimension_mean 
                      0                       0                       0 
              radius_se              texture_se            perimeter_se 
                      0                       0                       0 
                area_se           smoothness_se          compactness_se 
                      0                       0                       0 
           concavity_se       concave.points_se             symmetry_se 
                      0                       0                       0 
   fractal_dimension_se            radius_worst           texture_worst 
                      0                       0                       0 
        perimeter_worst              area_worst        smoothness_worst 
                      0                       0                       0 
      compactness_worst         concavity_worst    concave.points_worst 
                      0                       0                       0 
         symmetry_worst fractal_dimension_worst 
                      0                       0 

2.4 Distribusi Data

# Histogram of every numeric feature, one facet per column.
# `id` is a record identifier rather than a measurement, so it is dropped
# before plotting; pivot_longer() replaces the superseded gather().
df_cancer %>%
  select(where(is.numeric), -id) %>%
  pivot_longer(
    cols = everything(),
    names_to = "key",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "steelblue") +
  facet_wrap(~key, scales = "free")

2.5 Correlation Matrix

# Correlation matrix of the numeric measurements only. `id` is numeric but
# is just a record identifier, so it is excluded to keep a meaningless
# row/column out of the plot.
num_data <- df_cancer %>%
  select(where(is.numeric), -id)

cor_matrix <- cor(num_data)
corrplot(cor_matrix, method = "color", tl.cex = 0.7)

3 Preprocessing

3.1 Remove kolom tidak perlu

# Keep only the numeric columns, with the `id` column removed.
data_clean <- select(df_cancer, where(is.numeric), -id)

3.2 Standarisasi

# Standardise each column to mean 0 and standard deviation 1
# (these are scale()'s defaults, written out explicitly).
data_scaled <- scale(data_clean, center = TRUE, scale = TRUE)

4 Clustering K-Means

# Principal component analysis of the standardised features.
# The argument is spelled `scale.` in full: prcomp() has no `scale`
# argument, so the shorter form only worked through partial matching.
pca <- prcomp(data_scaled, scale. = TRUE)
summary(pca)
Importance of components:
                          PC1    PC2     PC3     PC4     PC5     PC6     PC7
Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
                           PC8    PC9    PC10   PC11    PC12    PC13    PC14
Standard deviation     0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
Cumulative Proportion  0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
                          PC15    PC16    PC17    PC18    PC19    PC20   PC21
Standard deviation     0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
Cumulative Proportion  0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
                          PC22    PC23   PC24    PC25    PC26    PC27    PC28
Standard deviation     0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
Cumulative Proportion  0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
                          PC29    PC30
Standard deviation     0.02736 0.01153
Proportion of Variance 0.00002 0.00000
Cumulative Proportion  1.00000 1.00000
# Scree plot of the variance explained by each component.
fviz_eig(pca)

# Scores on the first two principal components.
data_pca <- pca$x[, seq_len(2)]

# Three-cluster k-means on the standardised features; the seed fixes the
# 25 random starts.
set.seed(123)
kmeans_model <- kmeans(x = data_scaled, centers = 3, nstart = 25)

kmeans_model$cluster
  [1] 3 3 3 2 3 2 3 2 2 2 1 2 3 1 2 2 1 2 3 1 1 1 2 3 3 3 2 3 2 3 3 2 3 3 2 2 2
 [38] 1 1 2 1 2 3 2 1 3 1 2 1 1 1 1 1 3 1 1 3 2 1 1 1 1 2 1 2 2 1 1 2 1 3 2 2 1
 [75] 1 3 1 3 3 1 1 2 3 3 1 3 1 3 1 2 1 1 1 1 2 3 1 1 1 2 1 1 1 1 1 2 1 1 3 1 1
[112] 2 2 1 1 1 1 2 2 1 1 3 3 1 1 1 1 3 2 3 1 1 1 1 3 1 1 1 3 1 1 1 1 1 1 2 2 1
[149] 1 1 1 2 2 1 1 1 3 1 1 1 1 3 3 1 3 1 1 1 3 1 1 1 2 1 1 1 2 2 1 1 3 3 1 1 1
[186] 1 1 1 1 1 2 1 1 2 2 1 2 3 3 2 1 3 3 2 1 1 1 1 2 1 3 1 3 2 2 2 2 1 3 3 1 1
[223] 1 2 1 1 1 1 1 2 2 1 1 3 1 1 3 3 1 3 1 1 2 1 3 1 1 2 1 1 3 1 3 1 3 1 3 2 3
[260] 2 3 1 3 1 3 3 1 1 1 2 1 1 3 1 1 1 1 1 1 1 3 1 3 2 1 1 1 1 2 1 2 1 1 1 1 1
[297] 1 1 1 1 3 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 2 1 1 3 1 3 1 1 1 1 2 2 2 1 1
[334] 1 1 3 1 3 1 3 1 1 1 3 1 1 1 1 1 1 1 2 3 2 1 1 2 1 1 1 1 1 1 1 1 3 3 1 3 3
[371] 2 1 3 3 1 1 2 1 1 2 1 1 1 2 1 1 1 1 2 3 1 1 2 3 1 1 1 1 1 1 2 1 1 1 1 1 1
[408] 1 3 1 1 1 1 1 1 1 1 3 1 1 1 2 1 1 1 1 1 1 1 1 2 1 3 3 1 2 1 1 1 1 2 3 1 1
[445] 1 1 3 1 1 3 1 3 1 1 1 1 1 1 1 1 3 3 1 1 1 2 1 1 3 2 1 1 1 1 1 1 1 1 1 2 1
[482] 1 1 1 1 2 1 3 1 1 1 1 3 1 1 1 2 1 3 3 1 2 1 3 2 2 1 2 1 2 1 1 2 1 1 1 3 3
[519] 1 1 2 3 1 1 1 1 1 1 1 1 1 1 1 3 1 3 1 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
[556] 1 1 1 1 1 1 1 2 3 3 3 1 3 1
# Cluster plot for the three-cluster fit.
fviz_cluster(kmeans_model, data = data_scaled)

# Fit k-means for K = 2..5, 25 random starts each.
cancer_K2 <- kmeans(x = data_scaled, centers = 2, nstart = 25)
cancer_K3 <- kmeans(x = data_scaled, centers = 3, nstart = 25)
cancer_K4 <- kmeans(x = data_scaled, centers = 4, nstart = 25)
cancer_K5 <- kmeans(x = data_scaled, centers = 5, nstart = 25)

# One point-only cluster plot per fit.
p1 <- fviz_cluster(cancer_K2, data = data_scaled, geom = "point") +
  ggtitle(" K = 2")
p2 <- fviz_cluster(cancer_K3, data = data_scaled, geom = "point") +
  ggtitle(" K = 3")
p3 <- fviz_cluster(cancer_K4, data = data_scaled, geom = "point") +
  ggtitle(" K = 4")
p4 <- fviz_cluster(cancer_K5, data = data_scaled, geom = "point") +
  ggtitle(" K = 5")

# Arrange the four plots in a 2 x 2 grid.
grid.arrange(p1, p2, p3, p4, nrow = 2)

# Candidate numbers of clusters: silhouette method first, then
# within-cluster sum of squares.
fviz_nbclust(x = data_scaled, FUNcluster = kmeans, method = "silhouette")

fviz_nbclust(x = data_scaled, FUNcluster = kmeans, method = "wss")

# Gap statistic for k = 1..15 using 50 simulated reference sets; nstart is
# forwarded to kmeans(). The seed makes the simulation reproducible.
set.seed(123)
gap_stat <- clusGap(
  x = data_scaled,
  FUNcluster = kmeans,
  K.max = 15,
  B = 50,
  nstart = 25
)

# Report the result, choosing k with the "firstmax" rule.
print(gap_stat, method = "firstmax")
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = data_scaled, FUNcluster = kmeans, K.max = 15, B = 50, nstart = 25)
B=50 simulated reference sets, k = 1..15; spaceH0="scaledPCA"
 --> Number of clusters (method 'firstmax'): 2
          logW   E.logW       gap      SE.sim
 [1,] 6.903237 7.802259 0.8990224 0.006603625
 [2,] 6.708914 7.696045 0.9871308 0.006571583
 [3,] 6.646568 7.630162 0.9835946 0.005454643
 [4,] 6.609021 7.583872 0.9748510 0.005193434
 [5,] 6.561763 7.556734 0.9949712 0.005108036
 [6,] 6.532478 7.533734 1.0012561 0.005221217
 [7,] 6.498516 7.514930 1.0164144 0.005232757
 [8,] 6.482822 7.498273 1.0154506 0.005325144
 [9,] 6.459053 7.484009 1.0249562 0.005351269
[10,] 6.441244 7.471300 1.0300555 0.005228238
[11,] 6.423310 7.459754 1.0364441 0.005024080
[12,] 6.407989 7.448861 1.0408729 0.004841184
[13,] 6.393568 7.438572 1.0450047 0.004865372
[14,] 6.383645 7.428971 1.0453255 0.004559727
[15,] 6.370427 7.419974 1.0495463 0.004654894
# Plot the gap statistic against the number of clusters.
fviz_gap_stat(gap_stat)

# Compute the final k-means clustering with k = 2
set.seed(123)
final <- kmeans(data_scaled, centers = 2, nstart = 25)
print(final)
K-means clustering with 2 clusters of sizes 189, 380

Cluster means:
  radius_mean texture_mean perimeter_mean  area_mean smoothness_mean
1   0.9731199    0.4810905      1.0057496  0.9626801       0.6087185
2  -0.4839991   -0.2392792     -0.5002281 -0.4788067      -0.3027573
  compactness_mean concavity_mean concave.points_mean symmetry_mean
1        1.0197987       1.138428           1.1635583     0.6106013
2       -0.5072157      -0.566218          -0.5787172    -0.3036938
  fractal_dimension_mean  radius_se  texture_se perimeter_se    area_se
1              0.2520081  0.8578415  0.04270321    0.8595226  0.8063982
2             -0.1253409 -0.4266633 -0.02123923   -0.4274994 -0.4010770
  smoothness_se compactness_se concavity_se concave.points_se symmetry_se
1    0.01704563      0.6944395    0.6363352         0.7755561   0.1402588
2   -0.00847796     -0.3453923   -0.3164930        -0.3857371  -0.0697603
  fractal_dimension_se radius_worst texture_worst perimeter_worst area_worst
1            0.4146673     1.039169     0.5058654       1.0650336  1.0022723
2           -0.2062424    -0.516850    -0.2516015      -0.5297141 -0.4984986
  smoothness_worst compactness_worst concavity_worst concave.points_worst
1        0.6077580         0.9500013       1.0433804             1.145203
2       -0.3022796        -0.4725007      -0.5189444            -0.569588
  symmetry_worst fractal_dimension_worst
1      0.5968910               0.6219221
2     -0.2968747              -0.3093244

Clustering vector:
  [1] 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [38] 2 2 2 2 2 1 2 2 1 2 1 2 2 2 2 2 1 2 2 1 1 2 2 2 2 1 2 1 1 2 2 1 2 1 2 1 2
 [75] 2 1 2 1 1 2 2 1 1 1 2 1 2 1 2 1 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2
[112] 2 1 2 2 2 2 1 1 2 2 1 1 2 2 2 2 1 1 1 2 1 1 2 1 2 2 2 1 2 2 1 2 2 2 2 1 2
[149] 2 2 2 2 1 2 2 2 1 2 2 2 2 1 1 2 1 2 2 1 1 2 2 2 1 2 2 2 2 1 2 2 1 1 2 2 2
[186] 2 2 2 2 2 1 2 2 1 1 2 1 1 1 1 2 1 1 1 2 2 2 2 2 2 1 2 1 1 1 1 2 2 1 1 2 2
[223] 2 1 2 2 2 2 2 1 1 2 2 1 2 2 1 1 2 1 2 2 1 2 1 2 2 2 2 2 1 2 1 1 1 2 1 1 1
[260] 1 1 2 1 2 1 1 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 1 1 2 2 2 2 2 2 1 2 2 2 2 2
[297] 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 1 2 1 2 2 2 2 1 1 1 2 2
[334] 2 2 1 2 1 2 1 2 2 2 1 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 1
[371] 1 2 1 1 2 2 1 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 2 2 2 2 2 1 2 2 2 2 2 2
[408] 2 1 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 1 2 1 1 2 2 2 2 2 2 2 1 2 2
[445] 2 2 1 2 2 1 2 1 2 2 2 2 2 2 2 2 1 1 2 2 2 1 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2
[482] 2 2 2 2 1 2 1 2 2 2 2 1 2 2 2 2 2 1 1 2 1 2 1 1 2 2 2 2 1 2 2 1 2 2 2 1 1
[519] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[556] 2 2 2 2 2 2 2 1 1 1 1 1 1 2

Within cluster sum of squares by cluster:
[1] 6325.137 5249.946
 (between_SS / total_SS =  32.1 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
# Visualise the final two-cluster k-means solution.
fviz_cluster(final, data = data_scaled)

5 Clustering DBSCAN

# k-nearest-neighbour distance plot (k = 4); the dashed red line marks the
# distance 2 that is used as `eps` below. abline() draws onto the plot
# created by kNNdistplot(), so the two calls must stay in this order.
kNNdistplot(data_scaled, k = 4)
abline(h = 2, col = "red", lty = 2)

# DBSCAN with eps = 2 and minPts = 4, then print the per-row labels.
dbscan_model <- dbscan(data_scaled, eps = 2, minPts = 4)
dbscan_model$cluster
  [1] 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [38] 0 0 0 2 0 0 0 0 0 0 0 2 2 2 2 2 0 2 2 0 0 2 0 0 0 0 0 0 0 0 2 0 2 0 0 0 0
 [75] 2 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 2 0 2 2 0 0 0 0 2 0 2 0 2 0 0 0 0 2 0 0 0
[112] 0 0 0 0 2 0 0 0 0 2 0 0 2 0 2 0 0 0 0 2 1 0 2 1 2 0 2 0 0 0 0 2 2 2 0 0 0
[149] 2 2 0 0 0 2 2 2 0 0 2 2 0 0 0 0 0 2 0 0 0 2 2 2 0 0 0 0 0 0 0 2 0 0 2 0 0
[186] 0 0 2 0 2 0 0 0 0 0 2 0 0 0 0 2 1 0 0 2 0 2 0 0 2 0 2 0 0 0 0 0 0 0 0 0 2
[223] 2 0 2 0 2 2 0 0 0 0 0 0 2 2 0 0 0 0 2 2 0 0 0 0 2 0 0 2 0 2 0 1 1 2 0 0 0
[260] 0 0 0 0 2 1 0 0 2 2 0 2 2 0 0 0 0 2 0 2 2 0 0 0 0 0 2 0 2 0 0 0 0 2 2 2 2
[297] 0 0 2 0 0 0 0 2 2 0 2 0 2 2 2 2 2 0 0 2 2 1 0 0 0 0 0 0 2 2 2 2 1 0 0 0 0
[334] 2 2 0 0 0 2 0 3 0 2 0 2 0 2 2 2 0 2 0 0 0 0 0 0 2 0 0 2 2 2 2 2 0 0 2 0 0
[371] 0 2 0 0 2 0 0 2 0 0 0 2 0 0 2 0 2 2 0 0 2 0 0 0 2 2 0 0 2 2 0 2 0 2 0 2 2
[408] 0 1 2 2 2 0 0 0 2 0 0 2 0 2 0 2 3 0 2 0 2 2 2 0 0 0 0 2 0 2 2 2 2 0 0 2 0
[445] 1 0 0 2 2 0 0 0 0 0 2 0 0 2 2 0 0 0 2 2 2 0 0 0 0 0 2 0 2 0 0 2 2 2 2 0 2
[482] 2 2 2 0 0 2 0 2 0 2 0 0 0 2 2 0 2 0 0 0 0 2 0 0 0 0 0 2 0 0 2 0 2 2 2 1 1
[519] 0 2 0 0 2 2 0 0 0 2 0 2 2 0 2 0 0 0 0 0 0 0 0 0 2 2 2 2 2 0 2 0 2 0 2 0 2
[556] 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Cluster plot straight from the fitted DBSCAN object.
fviz_cluster(dbscan_model, data = data_scaled, stand = FALSE)

# Same parameters again, plotted through the generic list interface.
db_result <- dbscan(data_scaled, eps = 2, minPts = 4)
fviz_cluster(list(data = data_scaled, cluster = db_result$cluster),
             main = "DBSCAN Clustering")

# Silhouette over every row. Label 0 (the rows counted as noise below) is
# passed to silhouette() like any other label, so this score treats the
# noise rows as one more cluster.
silhouette_dbscan <- silhouette(db_result$cluster, dist(data_scaled))
silhouette_score_dbscan <- mean(silhouette_dbscan[, 3])

is_noise_dbscan <- db_result$cluster == 0
noise_points_dbscan <- sum(is_noise_dbscan)

# Silhouette restricted to rows that were actually assigned to a cluster.
# It needs at least two clusters among those rows; otherwise report NA.
labels_clustered <- db_result$cluster[!is_noise_dbscan]
if (length(unique(labels_clustered)) >= 2) {
  silhouette_clustered <- silhouette(
    labels_clustered,
    dist(data_scaled[!is_noise_dbscan, , drop = FALSE])
  )
  silhouette_score_dbscan_clustered <- mean(silhouette_clustered[, 3])
} else {
  silhouette_score_dbscan_clustered <- NA_real_
}

cat("Silhouette Score DBSCAN:", silhouette_score_dbscan, "\n")
cat("Noise points DBSCAN:", noise_points_dbscan,
    "of", length(is_noise_dbscan), "\n")
cat("Silhouette Score DBSCAN (noise excluded):",
    silhouette_score_dbscan_clustered, "\n")
Silhouette Score DBSCAN: -0.1745369 

6 Clustering Mean Shift

# Mean shift on the first two principal-component scores, with a bandwidth
# of 3 for each of the two columns; then preview the cluster assignments.
set.seed(123)
ms_model <- meanShift(as.matrix(data_pca), bandwidth = c(3, 3))
head(ms_model$assignment)
     [,1]
[1,]    1
[2,]    1
[3,]    1
[4,]    1
[5,]    1
[6,]    1

  1   2   3   4 
561   6   1   1 

7 K terbaik Untuk Fuzzy C Means & K Median

# Diagnostics for choosing the number of clusters: elbow (WSS),
# silhouette, and gap statistic, all computed with k-means.
fviz_nbclust(data_scaled, kmeans, method = "wss") +
  labs(title = "Elbow Method")

fviz_nbclust(data_scaled, kmeans, method = "silhouette") +
  labs(title = "Silhouette Method")

# `FUNcluster` is clusGap()'s real argument name (`FUN` only matched it
# partially). nstart = 25 is forwarded to kmeans() so every fit uses 25
# random starts, as in the earlier gap-statistic run, instead of one.
set.seed(123)
gap_stat <- clusGap(
  data_scaled,
  FUNcluster = kmeans,
  K.max = 10,
  B = 50,
  nstart = 25
)
fviz_gap_stat(gap_stat) +
  labs(title = "Gap Statistic")

# Number of clusters used by the fuzzy c-means and k-medians fits below.
k_best <- 2

8 Clustering Fuzzy C Means

# Fuzzy c-means with k_best clusters: fuzzifier m = 2, 10 starts, at most
# 200 iterations, convergence value 1e-6.
set.seed(123)

res.fcm <- fcm(data_scaled, centers = k_best, m = 2, nstart = 10,
               iter.max = 200, con.val = 1e-6)

# Cluster label per row, followed by the resulting cluster sizes.
cluster_fcm <- res.fcm$cluster
print(table(cluster_fcm))
cluster_fcm
  1   2 
370 199 
# First rows of the membership matrix (one column per cluster).
head(res.fcm$u)
  Cluster 1 Cluster 2
1 0.2951839 0.7048161
2 0.3413579 0.6586421
3 0.1311350 0.8688650
4 0.4116690 0.5883310
5 0.2662053 0.7337947
6 0.3768976 0.6231024
# Silhouette widths of the FCM labels on distances between the scaled rows,
# their overall mean, and the silhouette plot.
sil_fcm <- silhouette(cluster_fcm, dist(data_scaled))
avg_sil <- mean(sil_fcm[, "sil_width"])
fviz_silhouette(sil_fcm, ggtheme = theme_minimal(), palette = "jco") +
  labs(title = "Silhouette Plot — FCM")
  cluster size ave.sil.width
1       1  370          0.45
2       2  199          0.13

# Cluster plot of the FCM labels with convex hulls around each cluster.
fviz_cluster(
  list(data = data_scaled, cluster = cluster_fcm),
  main         = paste("FCM Clustering — k =", k_best),
  ellipse.type = "convex",
  repel        = TRUE,
  palette      = "jco",
  ggtheme      = theme_minimal()
)

# Membership degrees as a data frame with one column per cluster.
membership_df <- as.data.frame(res.fcm$u)
colnames(membership_df) <- paste0("Cluster_", seq_len(k_best))

# Scatter of the two membership degrees, coloured by cluster label; the
# dashed red lines mark a membership degree of 0.5 on each axis.
ggplot(membership_df, aes(x = Cluster_1, y = Cluster_2)) +
  geom_point(aes(color = as.factor(cluster_fcm)), size = 2, alpha = 0.6) +
  geom_vline(xintercept = 0.5, color = "red", linetype = "dashed") +
  geom_hline(yintercept = 0.5, color = "red", linetype = "dashed") +
  scale_color_manual(
    name   = "Cluster",
    values = c("steelblue", "darkorange")
  ) +
  labs(
    x        = "Derajat keanggotaan ke Cluster 1",
    y        = "Derajat keanggotaan ke Cluster 2",
    title    = "FCM — Membership Degree",
    subtitle = "Titik dekat garis merah = observasi ambigu"
  ) +
  theme_minimal()

# Same clustering drawn as points only, with "norm" ellipses.
fviz_cluster(
  list(data = data_scaled, cluster = cluster_fcm),
  main         = paste("FCM Clustering — k =", k_best),
  geom         = "point",
  ellipse.type = "norm",
  palette      = "jco",
  ggtheme      = theme_minimal()
)

9 Clustering K Median

# K-medians through kcca() with k_best clusters and an iteration cap of 200.
set.seed(123)

res.kmedians <- kcca(data_scaled, k = k_best,
                     family = kccaFamily("kmedians"),
                     control = list(iter.max = 200))

# Cluster label per row, followed by the resulting cluster sizes.
cluster_kmed <- clusters(res.kmedians)
print(table(cluster_kmed))
cluster_kmed
  1   2 
196 373 
# Silhouette widths of the K-medians labels and their overall mean,
# reported to four decimals.
sil_kmed <- silhouette(cluster_kmed, dist(data_scaled))
avg_sil_kmed <- mean(sil_kmed[, "sil_width"])
cat("Silhouette Width K-Medians:", round(avg_sil_kmed, digits = 4), "\n")
Silhouette Width K-Medians: 0.3347 
# Silhouette plot for the K-medians labels (the call also prints each
# cluster's size and average silhouette width).
fviz_silhouette(sil_kmed, palette = "jco", ggtheme = theme_minimal()) +
  labs(title = "Silhouette Plot — K-Medians")
  cluster size ave.sil.width
1       1  196          0.15
2       2  373          0.43

# Cluster plot of the K-medians labels with convex hulls.
fviz_cluster(
  list(data = data_scaled, cluster = cluster_kmed),
  main         = paste("K-Medians Clustering — k =", k_best),
  ellipse.type = "convex",
  repel        = TRUE,
  palette      = "jco",
  ggtheme      = theme_minimal()
)

# Same clustering drawn as points only, with "norm" ellipses.
fviz_cluster(
  list(data = data_scaled, cluster = cluster_kmed),
  main         = paste("K-Medians Clustering — k =", k_best),
  geom         = "point",
  ellipse.type = "norm",
  palette      = "jco",
  ggtheme      = theme_minimal()
)