library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(clustMixType)
## Warning: package 'clustMixType' was built under R version 4.4.3
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(readr)
# Let the user pick the player CSV interactively; the first row of the file
# supplies the column names.
heart <- read.csv(
  file = file.choose(),
  header = TRUE
)
# Preview the first rows to confirm the import looks right.
head(heart)
## player_name position Age nationality height foot maret_value club_name
## 1 Bayu_Aji CB NA Lokal 178 NA 10 Arema_FC
## 2 Syaeful_Anwar CB 28 Lokal 184 1 125 Arema_FC
## 3 Bagas_Adi CB 26 Lokal 176 2 200 Arema_FC
## 4 Ariel_Lucero AM 24 Asing 170 2 250 Arema_FC
## 5 Arkhan_Fikri AM 18 Lokal 165 1 150 Arema_FC
## 6 Evan_Dimas CM 28 Lokal 167 3 250 Arema_FC
Konversi dilakukan agar setiap variabel memiliki tipe data yang sesuai. Hal ini untuk memastikan variabel kategorik dibaca sebagai faktor agar bisa diproses oleh k-prototypes.
# The player name is a free-text identifier, so keep it as character.
heart$player_name <- as.character(heart$player_name)
# Every remaining non-numeric column is categorical: turn each into a factor.
factor_cols <- c("position", "nationality", "foot", "club_name")
heart[factor_cols] <- lapply(heart[factor_cols], factor)
# Count the missing values in each column.
colSums(is.na(heart))
## player_name position Age nationality height foot
## 0 0 18 0 37 47
## maret_value club_name
## 64 0
# Mean imputation for the numeric variables: each NA is replaced by the mean
# of the observed values of the same column.
for (num_col in c("Age", "height", "maret_value")) {
  missing_rows <- is.na(heart[[num_col]])
  heart[[num_col]][missing_rows] <- base::mean(heart[[num_col]], na.rm = TRUE)
}
# Mode imputation for the categorical variable foot: pick the most frequent
# level and use it for every NA.
foot_counts <- sort(table(heart$foot), decreasing = TRUE)
modus <- names(foot_counts)[1]
heart$foot <- replace(heart$foot, is.na(heart$foot), modus)
Karena data yang didapat masih memiliki missing value pada beberapa variabel, missing value tersebut harus diisi: data numerik diisi dengan nilai rata-rata dan data kategorik diisi dengan nilai modus.
Standardisasi dilakukan dengan menggunakan z-score yang bertujuan untuk menyamakan skala variabel numerik agar tidak ada variabel yang mendominasi hasil clustering.
# Z-score standardisation of the numeric variables.
# scale() returns a one-column matrix carrying the centering/scaling values as
# attributes. Keep those objects in temporaries so the parameters can be read
# from them, and store plain numeric vectors in the data frame instead of
# matrix columns.
age_scaled <- scale(heart$Age)
height_scaled <- scale(heart$height)
mv_scaled <- scale(heart$maret_value)

# Save the z-score parameters for the inverse transform
mean_Age <- attr(age_scaled, "scaled:center")
sd_Age <- attr(age_scaled, "scaled:scale")
mean_height <- attr(height_scaled, "scaled:center")
sd_height <- attr(height_scaled, "scaled:scale")
mean_mv <- attr(mv_scaled, "scaled:center")
sd_mv <- attr(mv_scaled, "scaled:scale")

# as.numeric() drops the matrix dimensions and the scale attributes.
heart$Age <- as.numeric(age_scaled)
heart$height <- as.numeric(height_scaled)
heart$maret_value <- as.numeric(mv_scaled)
Jumlah cluster terbaik ditentukan dengan metode elbow, yaitu dengan melihat penurunan variasi dalam cluster.
# Elbow method: fit k-prototypes for k = 1..k.max and record the total
# within-cluster distance (tot.withinss) of each solution.
k.max <- 10
# kproto() starts from randomly chosen prototypes; fixing the seed makes the
# elbow curve reproducible from run to run.
set.seed(123)
wss <- vapply(
  seq_len(k.max),
  function(k) {
    # verbose = FALSE silences the progress output that kproto() otherwise
    # prints for each of the nstart restarts.
    kproto(heart, k, nstart = 10, verbose = FALSE)$tot.withinss
  },
  numeric(1)  # each fit must yield exactly one numeric value
)
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
# Elbow plot: total within-cluster variation against the number of clusters.
cluster_counts <- 1:k.max
plot(
  x = cluster_counts,
  y = wss,
  type = "b",
  pch = 19,
  frame.plot = FALSE,
  xlab = "Jumlah Cluster (k)",
  ylab = "Total Within-Cluster Variation"
)
# Mark the chosen number of clusters (k = 4) with a dashed vertical line.
abline(v = 4, lty = "dashed")
Diperoleh bahwa jumlah cluster optimal adalah k = 4.
Selanjutnya dilakukan clustering menggunakan algoritma k-prototypes dengan k = 4. Hal ini dilakukan untuk mengelompokkan data ke dalam beberapa cluster dengan mempertimbangkan data campuran (numerik dan kategorik).
# Final k-prototypes fit with k = 4 clusters.
# The seed makes the cluster assignment reproducible, and nstart = 10 matches
# the number of random restarts used when the elbow curve was computed, so the
# final model is not left to a single random initialisation.
set.seed(123)
kpres <- kproto(heart, 4, nstart = 10, verbose = FALSE)
## # NAs in variables:
## player_name position Age nationality height foot
## 0 0 0 0 0 0
## maret_value club_name
## 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.577157
# Attach the cluster label of every player to the data.
heart[["cluster"]] <- kpres$cluster
# Undo the z-score (x = mean + sd * z) so the numeric variables can be
# reported in their original units.
heart[["Age_asli"]] <- mean_Age + sd_Age * heart$Age
heart[["height_asli"]] <- mean_height + sd_height * heart$height
heart[["maret_value_asli"]] <- mean_mv + sd_mv * heart$maret_value
# Select the mixed (numeric + categorical) variables that go into the
# factor analysis of mixed data (FAMD).
famd_vars <- c(
  "position", "Age", "height", "foot",
  "maret_value", "nationality", "club_name"
)
heart_cluster <- heart[famd_vars]
# Keep only the first two dimensions and skip FactoMineR's built-in plots.
res.famd <- FAMD(base = heart_cluster, ncp = 2, graph = FALSE)
# Individual coordinates on the two dimensions, plus the cluster label of
# each row as a factor.
pca_df <- data.frame(res.famd$ind$coord)
pca_df[["cluster"]] <- factor(heart$cluster)
# Scatter plot of the players on the first two dimensions, coloured by cluster.
ggplot(
  data = pca_df,
  mapping = aes(x = Dim.1, y = Dim.2, colour = cluster)
) +
  geom_point(size = 3, alpha = 0.7) +
  labs(
    title = "Visualisasi Klaster Berdasarkan PCA",
    x = "Dimensi 1",
    y = "Dimensi 2"
  ) +
  theme_minimal() +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))
# Cluster profile: mean of each numeric variable (original units) and the most
# frequent level of each categorical variable, per cluster.
heart %>%
  group_by(cluster) %>%
  summarise(
    Usia_rata2 = round(mean(Age_asli), 1),
    Tinggi_rata2 = round(mean(height_asli), 1),
    Market_value_rata2 = round(mean(maret_value_asli), 2),
    # Mode of every categorical column; the results are named Mode_<column>.
    across(
      c(foot, position, club_name, nationality),
      function(values) names(sort(table(values), decreasing = TRUE))[1],
      .names = "Mode_{.col}"
    )
  )
## # A tibble: 4 × 8
## cluster Usia_rata2 Tinggi_rata2 Market_value_rata2 Mode_foot Mode_position
## <int> <dbl> <dbl> <dbl> <chr> <chr>
## 1 1 21.6 176. 93.0 1 DM
## 2 2 29.7 176. 121. 1 GK
## 3 3 29.1 183. 261. 1 CB
## 4 4 25.1 169. 148. 2 LB
## # ℹ 2 more variables: Mode_club_name <chr>, Mode_nationality <chr>