library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
## Warning: package 'cluster' was built under R version 4.5.3
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.5.3
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(dbscan)
## Warning: package 'dbscan' was built under R version 4.5.3
##
## Attaching package: 'dbscan'
##
## The following object is masked from 'package:stats':
##
## as.dendrogram
library(flexclust)
## Warning: package 'flexclust' was built under R version 4.5.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.5.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:flexclust':
##
## bclust
##
## The following object is masked from 'package:ggplot2':
##
## element
library(fpc)
## Warning: package 'fpc' was built under R version 4.5.3
##
## Attaching package: 'fpc'
##
## The following object is masked from 'package:dbscan':
##
## dbscan
library(meanShiftR)
df <- read_csv("C:/Users/Lenovo/Downloads/WA_Fn-UseC_-HR-Employee-Attrition.csv")
## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 35
## Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 41 Yes Travel_Rarely 1102 Sales 1 2
## 2 49 No Travel_Freque… 279 Research … 8 1
## 3 37 Yes Travel_Rarely 1373 Research … 2 2
## 4 33 No Travel_Freque… 1392 Research … 3 4
## 5 27 No Travel_Rarely 591 Research … 2 1
## 6 32 No Travel_Freque… 1005 Research … 2 2
## # ℹ 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## # EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## # HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## # JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## # MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## # PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## # RelationshipSatisfaction <dbl>, StandardHours <dbl>, …
str(df)
## spc_tbl_ [1,470 × 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : chr [1:1470] "Yes" "No" "Yes" "No" ...
## $ BusinessTravel : chr [1:1470] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ DailyRate : num [1:1470] 1102 279 1373 1392 591 ...
## $ Department : chr [1:1470] "Sales" "Research & Development" "Research & Development" "Research & Development" ...
## $ DistanceFromHome : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : chr [1:1470] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
## $ EmployeeCount : num [1:1470] 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : num [1:1470] 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : chr [1:1470] "Female" "Male" "Male" "Female" ...
## $ HourlyRate : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : chr [1:1470] "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
## $ JobSatisfaction : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : chr [1:1470] "Single" "Married" "Single" "Married" ...
## $ MonthlyIncome : num [1:1470] 5993 5130 2090 2909 3468 ...
## $ MonthlyRate : num [1:1470] 19479 24907 2396 23159 16632 ...
## $ NumCompaniesWorked : num [1:1470] 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : chr [1:1470] "Y" "Y" "Y" "Y" ...
## $ OverTime : chr [1:1470] "Yes" "No" "Yes" "Yes" ...
## $ PercentSalaryHike : num [1:1470] 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : num [1:1470] 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: num [1:1470] 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : num [1:1470] 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : num [1:1470] 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : num [1:1470] 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : num [1:1470] 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : num [1:1470] 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : num [1:1470] 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : num [1:1470] 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : num [1:1470] 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : num [1:1470] 5 7 0 0 2 6 0 0 8 7 ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Attrition = col_character(),
## .. BusinessTravel = col_character(),
## .. DailyRate = col_double(),
## .. Department = col_character(),
## .. DistanceFromHome = col_double(),
## .. Education = col_double(),
## .. EducationField = col_character(),
## .. EmployeeCount = col_double(),
## .. EmployeeNumber = col_double(),
## .. EnvironmentSatisfaction = col_double(),
## .. Gender = col_character(),
## .. HourlyRate = col_double(),
## .. JobInvolvement = col_double(),
## .. JobLevel = col_double(),
## .. JobRole = col_character(),
## .. JobSatisfaction = col_double(),
## .. MaritalStatus = col_character(),
## .. MonthlyIncome = col_double(),
## .. MonthlyRate = col_double(),
## .. NumCompaniesWorked = col_double(),
## .. Over18 = col_character(),
## .. OverTime = col_character(),
## .. PercentSalaryHike = col_double(),
## .. PerformanceRating = col_double(),
## .. RelationshipSatisfaction = col_double(),
## .. StandardHours = col_double(),
## .. StockOptionLevel = col_double(),
## .. TotalWorkingYears = col_double(),
## .. TrainingTimesLastYear = col_double(),
## .. WorkLifeBalance = col_double(),
## .. YearsAtCompany = col_double(),
## .. YearsInCurrentRole = col_double(),
## .. YearsSinceLastPromotion = col_double(),
## .. YearsWithCurrManager = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(df)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 Length:1470 Length:1470 Min. : 102.0
## 1st Qu.:30.00 Class :character Class :character 1st Qu.: 465.0
## Median :36.00 Mode :character Mode :character Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
## Department DistanceFromHome Education EducationField
## Length:1470 Min. : 1.000 Min. :1.000 Length:1470
## Class :character 1st Qu.: 2.000 1st Qu.:2.000 Class :character
## Mode :character Median : 7.000 Median :3.000 Mode :character
## Mean : 9.193 Mean :2.913
## 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
## EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender
## Min. :1 Min. : 1.0 Min. :1.000 Length:1470
## 1st Qu.:1 1st Qu.: 491.2 1st Qu.:2.000 Class :character
## Median :1 Median :1020.5 Median :3.000 Mode :character
## Mean :1 Mean :1024.9 Mean :2.722
## 3rd Qu.:1 3rd Qu.:1555.8 3rd Qu.:4.000
## Max. :1 Max. :2068.0 Max. :4.000
## HourlyRate JobInvolvement JobLevel JobRole
## Min. : 30.00 Min. :1.00 Min. :1.000 Length:1470
## 1st Qu.: 48.00 1st Qu.:2.00 1st Qu.:1.000 Class :character
## Median : 66.00 Median :3.00 Median :2.000 Mode :character
## Mean : 65.89 Mean :2.73 Mean :2.064
## 3rd Qu.: 83.75 3rd Qu.:3.00 3rd Qu.:3.000
## Max. :100.00 Max. :4.00 Max. :5.000
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## Min. :1.000 Length:1470 Min. : 1009 Min. : 2094
## 1st Qu.:2.000 Class :character 1st Qu.: 2911 1st Qu.: 8047
## Median :3.000 Mode :character Median : 4919 Median :14236
## Mean :2.729 Mean : 6503 Mean :14313
## 3rd Qu.:4.000 3rd Qu.: 8379 3rd Qu.:20462
## Max. :4.000 Max. :19999 Max. :26999
## NumCompaniesWorked Over18 OverTime PercentSalaryHike
## Min. :0.000 Length:1470 Length:1470 Min. :11.00
## 1st Qu.:1.000 Class :character Class :character 1st Qu.:12.00
## Median :2.000 Mode :character Mode :character Median :14.00
## Mean :2.693 Mean :15.21
## 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :9.000 Max. :25.00
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :80 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :80 Median :1.0000
## Mean :3.154 Mean :2.712 Mean :80 Mean :0.7939
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :80 Max. :3.0000
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.28 Mean :2.799 Mean :2.761 Mean : 7.008
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 3.000 Median : 1.000 Median : 3.000
## Mean : 4.229 Mean : 2.188 Mean : 4.123
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :18.000 Max. :15.000 Max. :17.000
df_num <- df %>% #variabel numerik
select(where(is.numeric))
df_num <- df_num %>% #missing value
mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
df_num <- df_num[, apply(df_num, 2, var) != 0]
df_scaled <- scale(df_num) #scalling data
fviz_nbclust(df_scaled, kmeans, method = "wss", k.max = 10) +
ggtitle("Elbow Method")
Grafik Elbow Method menunjukkan bahwa penurunan nilai Within Sum of Square cukup tajam hingga k = 3, lalu mulai melandai setelahnya. Hal ini menandakan bahwa jumlah cluster optimal berada di sekitar k = 3 atau 4, karena penambahan cluster setelah titik tersebut tidak lagi memberikan perbaikan signifikan.
fviz_nbclust(df_scaled, kmeans, method = "silhouette", k.max = 10) +
ggtitle("Silhouette Method")
Grafik Silhouette Method menunjukkan bahwa nilai average silhouette width tertinggi ada pada k = 2. Artinya, pembagian data menjadi 2 cluster memberikan hasil pemisahan yang paling baik dan paling jelas dibanding jumlah cluster lainnya.
set.seed(123)
kmeans_res <- kmeans(df_scaled, centers = 3, nstart = 25)
fviz_cluster(kmeans_res, data = df_scaled,
ellipse.type = "norm",
geom = "point")
Visualisasi K-Means Clustering menunjukkan bahwa data terbagi menjadi tiga kelompok yang berbeda dengan batas yang cukup jelas. Cluster 3 memiliki nilai tinggi pada JobLevel, MonthlyIncome, dan TotalWorkingYears sehingga merepresentasikan karyawan senior. Cluster 2 unggul pada PerformanceRating dan PercentSalaryHike yang menunjukkan performa tinggi. Cluster 1 dan 2 cenderung berisi karyawan dengan pengalaman dan pendapatan lebih rendah.
kmedian_res <- kcca(df_scaled, k = 3, family = kccaFamily("kmedians"))
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
## Found more than one class "kcca" in cache; using the first, from namespace 'flexclust'
## Also defined by 'kernlab'
fviz_cluster(list(data = df_scaled, cluster = clusters(kmedian_res)),
ellipse.type = "norm",
geom = "point")
Visualisasi K-Median Clustering memperlihatkan data terbagi menjadi tiga kelompok dengan sedikit tumpang tindih antar cluster. Hal ini menunjukkan bahwa meskipun pembagian cluster cukup jelas, ada beberapa data yang memiliki kemiripan karakteristik dengan cluster lain.
dbscan_res <- dbscan(df_scaled, eps = 2, MinPts = 5)
fviz_cluster(dbscan_res, data = df_scaled,
geom = "point")
table(dbscan_res$cluster)
##
## 0
## 1470
Visualisasi DBSCAN Clustering memperlihatkan bahwa algoritma ini mengelompokkan data berdasarkan kepadatan titik, bukan jumlah cluster yang ditentukan sebelumnya. Terlihat area pusat yang lebih padat membentuk kelompok, sementara titik-titik di luar yang jarang dianggap sebagai noise.Hasil dari table(dbscan_res$cluster) adalah 0 = 1470 artinya seluruh data dianggap noise dan tidak terbentuk cluster.Hal ini terjadi karena data berdimensi tinggi sehingga jarak antar titik menjadi tidak signifikan.
ms_res <- meanShift(df_scaled)
pca <- prcomp(df_scaled)
plot(pca$x[,1:2], col = ms_res$assignment,
pch = 19)
Visualisasi menggunakan PCA menunjukkan persebaran data berdasarkan cluster tersebut. Jumlah cluster yang terbentuk tidak ditentukan di awal dan bisa bervariasi. Hasil menunjukkan setiap data menjadi cluster sendiri, sehingga terbentuk sangat banyak cluster. Hal ini menandakan metode gagal menemukan struktur kelompok yang jelas. Oleh karena itu, Mean Shift tidak cocok digunakan pada dataset ini.
fcm_res <- cmeans(df_scaled, centers = 3, m = 2)
fviz_cluster(list(data = df_scaled, cluster = fcm_res$cluster),
ellipse.type = "norm",
geom = "point")
Visualisasi Fuzzy C-Means memperlihatkan bahwa data terbagi menjadi dua cluster dengan batas yang saling tumpang tindih. Hal ini menunjukkan adanya data yang memiliki keanggotaan pada lebih dari satu cluster, sehingga metode ini mampu menangkap kemiripan antar kelompok secara lebih detail.
dist_matrix <- dist(df_scaled)
sil_kmeans <- mean(silhouette(kmeans_res$cluster, dist_matrix)[,3])
sil_kmedian <- mean(silhouette(clusters(kmedian_res), dist_matrix)[,3])
db_cluster <- dbscan_res$cluster
if(length(unique(db_cluster)) > 1){
idx <- which(db_cluster != 0)
if(length(idx) > 1){
df_no_noise <- df_scaled[idx, ]
cluster_no_noise <- db_cluster[idx]
if(length(unique(cluster_no_noise)) > 1){
dist_db <- dist(df_no_noise)
sil_dbscan <- mean(silhouette(cluster_no_noise, dist_db)[,3])
} else {
sil_dbscan <- NA
}
} else {
sil_dbscan <- NA
}
} else {
sil_dbscan <- NA
}
sil_fcm <- mean(silhouette(fcm_res$cluster, dist_matrix)[,3])
print(sil_kmeans)
## [1] 0.125712
print(sil_kmedian)
## [1] 0.1111544
print(sil_dbscan)
## [1] NA
print(sil_fcm)
## [1] 0.121812
Berdasarkan hasil perhitungan, metode K-Means memiliki nilai silhouette tertinggi (0.1257) dibandingkan metode lainnya, sehingga menghasilkan kualitas cluster terbaik. Metode Fuzzy C-Means dan K-Median memiliki nilai yang sedikit lebih rendah, namun masih cukup mendekati. Sementara itu, DBSCAN tidak dapat dievaluasi karena tidak menghasilkan cluster (nilai NA).Nilai silhouette yang relatif kecil (< 0.5) menunjukkan bahwa pemisahan antar cluster masih belum terlalu kuat.Namun, K-Means tetap menjadi metode paling optimal dibandingkan metode lainnya pada dataset ini.
dunn_kmeans <- cluster.stats(dist_matrix, kmeans_res$cluster)$dunn
dunn_kmedian <- cluster.stats(dist_matrix, clusters(kmedian_res))$dunn
db_cluster <- dbscan_res$cluster
if(length(unique(db_cluster)) > 1){
idx <- which(db_cluster != 0)
if(length(idx) > 1){
df_no_noise <- df_scaled[idx, ]
cluster_no_noise <- db_cluster[idx]
if(length(unique(cluster_no_noise)) > 1){
dist_db <- dist(df_no_noise)
dunn_dbscan <- cluster.stats(dist_db, cluster_no_noise)$dunn
} else {
dunn_dbscan <- NA
}
} else {
dunn_dbscan <- NA
}
} else {
dunn_dbscan <- NA
}
dunn_fcm <- cluster.stats(dist_matrix, fcm_res$cluster)$dunn
## Warning in cluster.stats(dist_matrix, fcm_res$cluster): clustering renumbered
## because maximum != number of clusters
Nilai Dunn Index menunjukkan bahwa metode K-Means memiliki nilai tertinggi (0.2204), diikuti K-Median yang memiliki nilai hampir sama. Sementara itu, Fuzzy C-Means memiliki nilai lebih rendah, sehingga kualitas pemisahan cluster kurang optimal. DBSCAN tidak dapat dievaluasi karena tidak menghasilkan cluster.
hasil_eval <- data.frame(
Metode = c("K-Means","K-Median","DBSCAN","FCM"),
Silhouette = c(sil_kmeans, sil_kmedian, sil_dbscan, sil_fcm),
Dunn = c(dunn_kmeans, dunn_kmedian, dunn_dbscan, dunn_fcm)
)
print(hasil_eval)
## Metode Silhouette Dunn
## 1 K-Means 0.1257120 0.2203805
## 2 K-Median 0.1111544 0.2197632
## 3 DBSCAN NA NA
## 4 FCM 0.1218120 0.1859636
Berdasarkan tabel evaluasi, metode K-Means memiliki nilai Silhouette dan Dunn Index tertinggi dibandingkan metode lainnya. Hal ini menunjukkan bahwa K-Means mampu menghasilkan cluster yang paling kompak dan terpisah dengan baik. Oleh karena itu, K-Means dipilih sebagai metode terbaik dalam penelitian ini.
best <- hasil_eval[which.max(hasil_eval$Silhouette), ]
print(best)
## Metode Silhouette Dunn
## 1 K-Means 0.125712 0.2203805
Hasil pemilihan metode terbaik menunjukkan bahwa K-Means menjadi metode optimal dengan nilai silhouette sebesar 0.1257. Hal ini menegaskan bahwa K-Means paling mampu merepresentasikan struktur data dibandingkan metode lainnya. Dengan demikian, hasil clustering menggunakan K-Means digunakan sebagai dasar analisis lebih lanjut.
Berdasarkan hasil analisis clustering pada dataset, metode K-Means, K-Median, dan Fuzzy C-Means mampu membentuk cluster, sedangkan DBSCAN dan Mean Shift tidak memberikan hasil yang optimal. Evaluasi menggunakan Silhouette Score dan Dunn Index menunjukkan bahwa metode K-Means memiliki performa terbaik dengan nilai evaluasi tertinggi. Hal ini menunjukkan bahwa K-Means paling efektif dalam mengelompokkan data dengan struktur yang ada. Oleh karena itu, metode K-Means dipilih sebagai metode terbaik untuk merepresentasikan pola dalam dataset.