Analisis Metode 2: K-Means Clustering
Statistika Multivariat - Project
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
## Warning: package 'PerformanceAnalytics' was built under R version 4.3.3
## Loading required package: xts
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
## Warning: package 'ggpubr' was built under R version 4.3.3
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.95 loaded
## Warning: package 'readxl' was built under R version 4.3.3
1 Data
## # A tibble: 6 × 10
## Perusahaan Tahun ROA DER OCFR AGE SIZE INST_OWN MNJ_OWN Y
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ADRO 2018 0.0676 0.641 0.328 14 32.3 43.9 12.4 2
## 2 ADRO 2019 0.0603 0.812 0.284 15 32.2 43.9 12.4 2
## 3 ADRO 2020 0.0248 0.615 0.304 16 32.1 43.9 12.4 2
## 4 ADRO 2021 0.136 0.702 0.459 17 32.3 43.9 12.4 2
## 5 AKRA 2018 0.0333 1.01 -0.0448 41 30.6 59.6 0.676 2
## 6 AKRA 2019 0.0327 1.13 0.0607 42 30.7 59.6 0.676 2
1.1 Struktur Data
## tibble [200 × 10] (S3: tbl_df/tbl/data.frame)
## $ Perusahaan: chr [1:200] "ADRO" "ADRO" "ADRO" "ADRO" ...
## $ Tahun : num [1:200] 2018 2019 2020 2021 2018 ...
## $ ROA : num [1:200] 0.0676 0.0603 0.0248 0.1356 0.0333 ...
## $ DER : num [1:200] 0.641 0.812 0.615 0.702 1.009 ...
## $ OCFR : num [1:200] 0.3285 0.2837 0.304 0.4591 -0.0448 ...
## $ AGE : num [1:200] 14 15 16 17 41 42 43 44 34 35 ...
## $ SIZE : num [1:200] 32.3 32.2 32.1 32.3 30.6 ...
## $ INST_OWN : num [1:200] 43.9 43.9 43.9 43.9 59.6 ...
## $ MNJ_OWN : num [1:200] 12.4 12.403 12.396 12.386 0.675 ...
## $ Y : num [1:200] 2 2 2 2 2 2 2 2 0 0 ...
1.2 Mengambil Variabel Numerik
## tibble [200 × 7] (S3: tbl_df/tbl/data.frame)
## $ ROA : num [1:200] 0.0676 0.0603 0.0248 0.1356 0.0333 ...
## $ DER : num [1:200] 0.641 0.812 0.615 0.702 1.009 ...
## $ OCFR : num [1:200] 0.3285 0.2837 0.304 0.4591 -0.0448 ...
## $ AGE : num [1:200] 14 15 16 17 41 42 43 44 34 35 ...
## $ SIZE : num [1:200] 32.3 32.2 32.1 32.3 30.6 ...
## $ INST_OWN: num [1:200] 43.9 43.9 43.9 43.9 59.6 ...
## $ MNJ_OWN : num [1:200] 12.4 12.403 12.396 12.386 0.675 ...
1.3 Pengecekan NA
## [1] TRUE
## ROA DER OCFR AGE SIZE INST_OWN MNJ_OWN
## 0 0 0 0 0 1 0
1.4 Imputasi NA
# Memilih variabel numerik untuk PCA
index <- index %>%
mutate(across(.fns = ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(.fns = ~ifelse(is.na(.), mean(., na.rm = TRUE), .))`.
## Caused by warning:
## ! Using `across()` without supplying `.cols` was deprecated in dplyr 1.1.0.
## ℹ Please supply `.cols` instead.
## ROA DER OCFR AGE SIZE INST_OWN MNJ_OWN
## 0 0 0 0 0 0 0
1.5 Eksplorasi Korelasi
# Correlation Chart (lebih rapi & bersih)
chart.Correlation(index,
histogram = TRUE,
pch = 19,
method = "pearson",
main = "Correlation Matrix – Financial Variables")## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
corr_matrix <- cor(index)
corrplot(corr_matrix,
method = "color",
type = "upper",
addCoef.col = "black",
tl.col = "black",
number.cex = 0.8,
col = colorRampPalette(c("red","white","blue"))(200),
title = "Correlation Heatmap – Financial Variables",
mar = c(0,0,2,0))
## Standarisasi Data
## ROA DER OCFR AGE SIZE INST_OWN MNJ_OWN
## 1 1 1 1 1 1 1
2 Matriks
# Ambil 6 observasi teratas
data_top6 <- data_standar[1:6, ]
jarak_top6 <- dist(data_top6, method = "euclidean")
matriks_jarak_top6 <- as.matrix(jarak_top6)
matriks_jarak_top6## 1 2 3 4 5 6
## 1 0.0000000 0.3369139 0.6817737 1.0604982 8.7787659 9.0672502
## 2 0.3369139 0.0000000 0.3796829 0.8470265 8.4568405 8.7470102
## 3 0.6817737 0.3796829 0.0000000 0.7601952 8.1420910 8.4302146
## 4 1.0604982 0.8470265 0.7601952 0.0000000 7.9054236 8.1793346
## 5 8.7787659 8.4568405 8.1420910 7.9054236 0.0000000 0.3973835
## 6 9.0672502 8.7470102 8.4302146 8.1793346 0.3973835 0.0000000
3 Penerapan K- Means
3.1 Penentuan Cluster Optimal
#Dalam menentukan jumlah cluster yang optimal pada K-Means, maka akan
menggunakan metode Elbow dan Silhoutte.
#Metode Elbow merupakan suatu metode yang digunakan untuk menghasilkan informasi dalam menentukan jumlah cluster terbaik dengan cara melihat persentase hasil perbandingan antara jumlah cluster yang akan membentuk siku pada suatu titik. Sedangkan metode silhoutte untuk menduga kualitas dari klaster yang terbentuk. Semakin tinggi nilai rata-ratanya maka cluster yang akan dibentuk semakin baik. didapat metode Jumlah cluster optimal menurut metode Silhouette: 3 Ini berbeda dari metode Elbow (yang sebelumnya menunjukkan k = 6). Maka dari itu kami memilih Jumlah cluster optimal dipilih k = 3 karena metode Silhouette menghasilkan skor pemisahan antar-cluster yang paling tinggi pada k tersebut, menunjukkan bahwa struktur kelompok dalam data lebih jelas, stabil, dan memiliki pemisahan antar-cluster yang kuat dibandingkan jumlah cluster lainnya.
3.2 Outlier Detection
## row col
## [1,] 18 1
## [2,] 19 1
## [3,] 126 1
## [4,] 134 1
## [5,] 13 2
## [6,] 19 2
## [7,] 43 2
## [8,] 65 2
## [9,] 66 2
## [10,] 48 3
## [11,] 116 3
## [12,] 120 3
## [13,] 129 3
## [14,] 1 4
## [15,] 2 4
## [16,] 3 4
## [17,] 4 4
## [18,] 46 7
## [19,] 47 7
## [20,] 48 7
## [21,] 77 7
## [22,] 78 7
## [23,] 79 7
## [24,] 80 7
## [25,] 133 7
## [26,] 134 7
## [27,] 135 7
3.3 Tangani Outlier
z <- scale(index)
outlier_rows <- unique(which(abs(z) > 3, arr.ind = TRUE)[,1])
# hapus outlier
index_clean <- index[-outlier_rows, ]
nrow(index_clean)## [1] 176
3.4 Analisis cluster
## K-means clustering with 3 clusters of sizes 48, 88, 40
##
## Cluster means:
## ROA DER OCFR AGE SIZE INST_OWN MNJ_OWN
## 1 0.04212542 1.284081 0.2748146 37.45833 29.51236 61.23010 0.6947896
## 2 0.04932500 1.194316 0.3357636 36.93182 28.55865 83.80730 0.5620648
## 3 -0.02844750 1.917768 0.1074425 37.00000 29.11819 35.47781 4.1260425
##
## Clustering vector:
## [1] 1 1 1 1 2 2 2 2 1 3 3 3 3 2 2 2 2 1 1 1 3 1 1 3 3 2 2 2 2 3 3 3 3 3 3 3 3
## [38] 3 3 3 3 3 3 3 3 1 3 3 3 1 1 1 1 1 1 1 1 1 1 3 3 3 1 2 2 2 2 2 2 2 2 2 2 2
## [75] 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 3 3 3 3 2 2 2 2 2 2 1 1 2 2 1 1 2 2 2 2
## [112] 1 2 2 2 2 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 3 3 3 2 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 2924.810 3795.979 5289.408
## (between_SS / total_SS = 84.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
3.5 Cluster Plot
## Silhouette Analysis
silhouette_plot <- silhouette(klaster$cluster, dist(index_clean))
fviz_silhouette(silhouette_plot, print.summary = TRUE) +
ggtitle("Silhouette Plot - Kualitas Setiap Cluster")## cluster size ave.sil.width
## 1 1 48 0.51
## 2 2 88 0.63
## 3 3 40 0.44
# Hitung average silhouette width
avg_silhouette <- mean(silhouette_plot[, "sil_width"])
cat("Average Silhouette Width:", round(avg_silhouette, 3), "\n")## Average Silhouette Width: 0.555
## (Nilai 0.5-1.0 = Struktur cluster kuat)
3.6 Cluster Profiling
# Tambahkan cluster assignment ke data asli
index_clean_with_cluster <- index_clean %>%
mutate(Cluster = as.factor(klaster$cluster))
# Ringkas karakteristik setiap cluster
cluster_profile <- index_clean_with_cluster %>%
group_by(Cluster) %>%
summarise(
n_observasi = n(),
ROA_mean = mean(ROA, na.rm = TRUE),
ROA_sd = sd(ROA, na.rm = TRUE),
DER_mean = mean(DER, na.rm = TRUE),
DER_sd = sd(DER, na.rm = TRUE),
OCFR_mean = mean(OCFR, na.rm = TRUE),
OCFR_sd = sd(OCFR, na.rm = TRUE),
AGE_mean = mean(AGE, na.rm = TRUE),
AGE_sd = sd(AGE, na.rm = TRUE),
SIZE_mean = mean(SIZE, na.rm = TRUE),
SIZE_sd = sd(SIZE, na.rm = TRUE),
INST_OWN_mean = mean(INST_OWN, na.rm = TRUE),
INST_OWN_sd = sd(INST_OWN, na.rm = TRUE),
MNJ_OWN_mean = mean(MNJ_OWN, na.rm = TRUE),
MNJ_OWN_sd = sd(MNJ_OWN, na.rm = TRUE)
)
print(cluster_profile)## # A tibble: 3 × 16
## Cluster n_observasi ROA_mean ROA_sd DER_mean DER_sd OCFR_mean OCFR_sd AGE_mean
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 48 0.0421 0.0935 1.28 2.82 0.275 0.318 37.5
## 2 2 88 0.0493 0.115 1.19 1.41 0.336 0.383 36.9
## 3 3 40 -0.0284 0.155 1.92 3.87 0.107 0.205 37
## # ℹ 7 more variables: AGE_sd <dbl>, SIZE_mean <dbl>, SIZE_sd <dbl>,
## # INST_OWN_mean <dbl>, INST_OWN_sd <dbl>, MNJ_OWN_mean <dbl>,
## # MNJ_OWN_sd <dbl>
3.7 Boxplot by Cluster
index_clean_with_cluster %>%
pivot_longer(-Cluster, names_to = "Variable", values_to = "Value") %>%
ggplot(aes(x = Cluster, y = Value, fill = Cluster)) +
geom_boxplot() +
facet_wrap(~Variable, scales = "free_y", ncol = 3) +
theme_minimal() +
theme(legend.position = "bottom") +
ggtitle("Distribution of Variables by Cluster")3.8 Heatmap Cluster Center
cluster_centers <- klaster$centers
rownames(cluster_centers) <- paste0("Cluster ", 1:3)
pheatmap(cluster_centers,
display_numbers = TRUE,
number_format = "%.3f",
main = "Cluster Centers (Data Asli)",
color = colorRampPalette(c("blue", "white", "red"))(50),
breaks = seq(min(cluster_centers), max(cluster_centers), length.out = 51))3.9 Anova Test
# Uji apakah ada perbedaan signifikan antar cluster untuk setiap variabel
variables <- colnames(index_clean)[1:7]
anova_results <- data.frame(
Variable = character(),
F_statistic = numeric(),
P_value = numeric(),
Significant = character()
)
for (var in variables) {
formula <- as.formula(paste(var, "~ Cluster"))
anova_test <- aov(formula, data = index_clean_with_cluster)
f_stat <- summary(anova_test)[[1]]$`F value`[1]
p_val <- summary(anova_test)[[1]]$`Pr(>F)`[1]
anova_results <- rbind(anova_results, data.frame(
Variable = var,
F_statistic = round(f_stat, 3),
P_value = round(p_val, 6),
Significant = ifelse(p_val < 0.05, "YES (p<0.05)", "NO (p≥0.05)")
))
}
print(anova_results)## Variable F_statistic P_value Significant
## 1 ROA 6.130 0.002677 YES (p<0.05)
## 2 DER 1.149 0.319351 NO (p≥0.05)
## 3 OCFR 6.490 0.001915 YES (p<0.05)
## 4 AGE 6.037 0.002923 YES (p<0.05)
## 5 SIZE 6.922 0.001283 YES (p<0.05)
## 6 INST_OWN 727.666 0.000000 YES (p<0.05)
## 7 MNJ_OWN 13.317 0.000004 YES (p<0.05)
##
## (Jika p-value < 0.05, ada perbedaan signifikan antar cluster)