# Load the raw laptop dataset and report how many rows and columns it has.
data <- read.csv(file = "laptop.csv")
cat("Dimensi data awal:", NROW(data), "baris,", NCOL(data), "kolom\n")
## Dimensi data awal: 1273 baris, 13 kolom
## Distribusi OS:
##
## Mac Others Windows
## 21 152 1100
# Filter dataset hanya untuk laptop dengan OS = Windows.
# Keep only the rows whose operating system is Windows.
data <- data[data$Os == "Windows", ]
cat("\nData setelah filter Windows:", nrow(data), "laptop\n")
##
## Data setelah filter Windows: 1100 laptop
# Drop invalid records: laptops that report neither SSD nor HDD capacity.
no_storage <- data$SSD == 0 & data$HDD == 0
invalid_idx <- which(no_storage)
cat("Data invalid (SSD=0 & HDD=0):", length(invalid_idx), "\n")
## Data invalid (SSD=0 & HDD=0): 46
# Select the complement of the invalid positions; with no invalid rows this
# keeps every row, so no separate emptiness guard is required.
data <- data[setdiff(seq_len(nrow(data)), invalid_idx), ]
cat("Data setelah buang invalid:", nrow(data), "laptop\n")
## Data setelah buang invalid: 1054 laptop
# Restrict the data to the numeric features that are relevant for clustering.
features <- c("Ram", "SSD", "HDD", "Weight", "Ppi", "Price")
data_cluster <- data[features]
cat("Fitur yang digunakan:", toString(features), "\n")
## Fitur yang digunakan: Ram, SSD, HDD, Weight, Ppi, Price
## Dimensi data clustering: 1054 x 6
## Tipe data per kolom:
## 'data.frame': 1054 obs. of 6 variables:
## $ Ram : int 4 16 8 4 8 8 8 8 8 4 ...
## $ SSD : int 0 512 256 256 256 128 128 256 256 0 ...
## $ HDD : int 500 0 0 0 0 0 1000 0 0 1000 ...
## $ Weight: num 2.1 1.3 1.6 2.2 2.2 1.22 2.5 1.62 1.91 2.3 ...
## $ Ppi : num 100 157 157 141 141 ...
## $ Price : num 9.97 11.29 10.62 10.19 10.59 ...
##
## Semua kolom numerik: TRUE
## Ram SSD HDD Weight
## Min. : 2.000 Min. : 0.0 Min. : 0.0 Min. :0.810
## 1st Qu.: 6.000 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.:1.565
## Median : 8.000 Median : 256.0 Median : 0.0 Median :2.040
## Mean : 8.953 Mean : 210.5 Mean : 422.8 Mean :2.077
## 3rd Qu.: 8.000 3rd Qu.: 256.0 3rd Qu.:1000.0 3rd Qu.:2.360
## Max. :64.000 Max. :1024.0 Max. :2000.0 Max. :4.700
## Ppi Price
## Min. : 90.58 Min. : 9.409
## 1st Qu.:127.34 1st Qu.:10.550
## Median :141.21 Median :10.977
## Mean :148.55 Mean :10.944
## 3rd Qu.:157.35 3rd Qu.:11.353
## Max. :352.47 Max. :12.691
# Melt the clustering features into `variable` / `value` columns so that each
# feature can be drawn in its own facet.
boxplot_data <- melt(data_cluster)

# Boxplot per feature before IQR cleaning; outlier points are drawn in red.
ggplot(boxplot_data, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(alpha = 0.8, outlier.color = "#E74C3C", outlier.size = 2) +
  scale_fill_manual(values = c("#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#3B1F2B", "#1B998B")) +
  labs(x = NULL, y = "Value",
       title = "Boxplot - Deteksi Outlier",
       subtitle = "Sebelum IQR Cleaning") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_blank(),
        strip.text = element_text(face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

# Histogram per feature on the same melted data. This is a separate statement:
# it must start on its own line, otherwise the file does not parse.
ggplot(boxplot_data, aes(x = value, fill = variable)) +
  geom_histogram(bins = 25, color = "white", alpha = 0.85) +
  scale_fill_manual(values = c("#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#3B1F2B", "#1B998B")) +
  labs(x = "Value", y = "Frequency",
       title = "Histogram - Distribusi Fitur",
       subtitle = "Melihat sebaran data tiap variabel") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal() +
  theme(legend.position = "none",
        strip.text = element_text(face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))
## Data sebelum IQR cleaning: 1054
# Compute the first and third quartile of every feature column.
# vapply() iterates over the data frame's columns directly (no matrix coercion
# as with apply()) and guarantees one numeric value per column; names = FALSE
# drops the "25%" / "75%" labels so the results are named by column only.
Q1 <- vapply(data_cluster, quantile, numeric(1), probs = 0.25, names = FALSE)
Q3 <- vapply(data_cluster, quantile, numeric(1), probs = 0.75, names = FALSE)
IQR_val <- Q3 - Q1
# Upper and lower fences per column (1.5 * IQR rule).
upper <- Q3 + 1.5 * IQR_val
lower <- Q1 - 1.5 * IQR_val
# Show the fences.
cat("Batas IQR:\n")
## Batas IQR:
print(data.frame(Q1 = round(Q1, 2), Q3 = round(Q3, 2), IQR = round(IQR_val, 2),
                 Lower = round(lower, 2), Upper = round(upper, 2)))
## Q1 Q3 IQR Lower Upper
## Ram 6.00 8.00 2.00 3.00 11.00
## SSD 0.00 256.00 256.00 -384.00 640.00
## HDD 0.00 1000.00 1000.00 -1500.00 2500.00
## Weight 1.56 2.36 0.79 0.37 3.55
## Ppi 127.34 157.35 30.01 82.31 202.37
## Price 10.55 11.35 0.80 9.35 12.56
# Remove every row that falls outside the fences of any column. The fences are
# computed once on the full data above, so filtering column by column is the
# same as keeping rows that are in range for all columns.
cleaned_cluster <- data_cluster
for (i in seq_along(data_cluster)) {
  in_range <- cleaned_cluster[[i]] >= lower[i] & cleaned_cluster[[i]] <= upper[i]
  cleaned_cluster <- cleaned_cluster[in_range, , drop = FALSE]
}
cat("\nData setelah IQR cleaning:", nrow(cleaned_cluster), "\n")
##
## Data setelah IQR cleaning: 780
## Outlier dihapus: 274
# Boxplot of the features after IQR cleaning.
# Melt `cleaned_cluster` (the filtered rows) rather than `data_cluster`, so the
# plot really shows the cleaned data its title announces.
# NOTE(review): if a chunk not shown here already reassigns
# `data_cluster <- cleaned_cluster`, both objects are identical and this change
# is a no-op — confirm.
boxplot_clean <- melt(cleaned_cluster)
ggplot(boxplot_clean, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(values = c("#27AE60", "#2ECC71", "#1ABC9C", "#16A085", "#138D75", "#0E6655")) +
  labs(x = NULL, y = "Value",
       title = "Boxplot - Setelah IQR Cleaning",
       subtitle = "Data lebih bersih dari outlier") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_blank(),
        strip.text = element_text(face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))
## Mean setelah scaling (harus ~0):
## Ram SSD HDD Weight Ppi Price
## 0 0 0 0 0 0
##
## Std Dev setelah scaling (harus ~1):
## Ram SSD HDD Weight Ppi Price
## 1 1 1 1 1 1
# Histogram of every feature after scaling, one facet per feature.
# `data_scaled` is produced outside this excerpt.
scaled_df <- as.data.frame(data_scaled)
scaled_melt <- melt(scaled_df)
ggplot(data = scaled_melt, mapping = aes(x = value, fill = variable)) +
  geom_histogram(bins = 25, color = "white", alpha = 0.85) +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  scale_fill_manual(
    values = c("#9B59B6", "#8E44AD", "#7D3C98", "#6C3483", "#5B2C6F", "#4A235A")
  ) +
  labs(
    title = "Distribusi Setelah Normalisasi",
    subtitle = "Semua fitur dalam skala yang sama",
    x = "Scaled Value (Z-Score)",
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    strip.text = element_text(face = "bold", size = 11),
    legend.position = "none"
  )
## Correlation Matrix:
## Ram SSD HDD Weight Ppi Price
## Ram 1.00 0.46 -0.10 -0.04 0.39 0.60
## SSD 0.46 1.00 -0.69 -0.38 0.53 0.58
## HDD -0.10 -0.69 1.00 0.51 -0.40 -0.30
## Weight -0.04 -0.38 0.51 1.00 -0.55 -0.23
## Ppi 0.39 0.53 -0.40 -0.55 1.00 0.54
## Price 0.60 0.58 -0.30 -0.23 0.54 1.00
# Visualise the correlation matrix with a red-white-blue colour ramp.
# `cor_matrix` is computed outside this excerpt.
corrplot(cor_matrix,
         method = "color",
         type = "upper",
         addCoef.col = "black",
         number.cex = 0.9,
         tl.col = "black",
         tl.srt = 45,
         tl.cex = 1.1,
         col = colorRampPalette(c("#E74C3C", "#FFFFFF", "#3498DB"))(100),
         title = "Korelasi Antar Fitur",
         mar = c(0, 0, 2, 0))

# Principal component analysis on the scaled features. This is a separate
# statement: it must start on its own line, otherwise the file does not parse.
pca_result <- prcomp(data_scaled)
# Variance explained by each component.
pca_var <- summary(pca_result)
print(pca_var)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7755 1.1170 0.8368 0.60670 0.57099 0.45335
## Proportion of Variance 0.5254 0.2079 0.1167 0.06135 0.05434 0.03425
## Cumulative Proportion 0.5254 0.7333 0.8501 0.91141 0.96575 1.00000
# Scree plot: share of variance explained by each principal component.
fviz_eig(
  pca_result,
  addlabels = TRUE,
  barfill = "#3498DB",
  barcolor = "#2980B9",
  linecolor = "#E74C3C"
) +
  labs(
    x = "Principal Component",
    y = "Percentage of Variance Explained",
    title = "PCA - Variance Explained",
    subtitle = "Kontribusi setiap Principal Component"
  ) +
  theme_minimal() +
  theme(
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
  )
# Feature contributions on PC1 and PC2
# Variable plot of the PCA, coloured by each feature's contribution.
fviz_pca_var(pca_result,
             col.var = "contrib",
             gradient.cols = c("#00B894", "#FDCB6E", "#E17055"),
             repel = TRUE) +
  labs(title = "PCA - Kontribusi Fitur",
       subtitle = "Hubungan fitur dengan Dim1 & Dim2") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

# Pairwise distance matrix of the scaled observations and its heat map.
# Each statement below starts on its own line; fused onto the end of the
# previous call, as before, the file does not parse.
jarak <- get_dist(data_scaled)
fviz_dist(jarak,
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) +
  labs(title = "Matriks Jarak Euclidean",
       subtitle = "Semakin gelap = semakin jauh jaraknya") +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

# K-Means with 2 centers; the seed is fixed so the 25 random starts are
# reproducible.
set.seed(123)
k2 <- kmeans(data_scaled, centers = 2, nstart = 25)
cat("K=2 | Total within SS:", round(k2$tot.withinss, 2), "\n")
## K=2 | Total within SS: 2851.79
## K=2 | Between/Total SS: 38.99 %
# K-Means with 3 centers; the seed is fixed so the 25 random starts are
# reproducible.
set.seed(123)
k3 <- kmeans(data_scaled, centers = 3, nstart = 25)
cat("K=3 | Total within SS:", round(k3$tot.withinss, 2), "\n")
## K=3 | Total within SS: 2165.96
## K=3 | Between/Total SS: 53.66 %

# Elbow method: total within-cluster sum of squares per K, with a dashed
# marker at K = 3.
fviz_nbclust(data_scaled, kmeans, method = "wss", nstart = 25) +
  geom_vline(xintercept = 3, linetype = 2, color = "#E74C3C", linewidth = 1) +
  labs(title = "Elbow Method - Menentukan K Optimal",
       subtitle = "Elbow (siku) berada di K=3",
       x = "Jumlah Cluster (K)",
       y = "Total Within Sum of Squares (WSS)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

# Silhouette method: average silhouette width per K. This and the statement
# after it each start on their own line; fused onto the previous call, as
# before, the file does not parse.
fviz_nbclust(data_scaled, kmeans, method = "silhouette", nstart = 25) +
  labs(title = "Silhouette Method - Menentukan K Optimal",
       subtitle = "Silhouette tertinggi di K=3",
       x = "Jumlah Cluster (K)",
       y = "Average Silhouette Width") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

# Side-by-side comparison of the fitted models.
# NOTE(review): `k4` is not created in the visible code; presumably a hidden
# chunk fits it the same way as k2 / k3 — confirm.
comparison <- data.frame(
  K = c(2, 3, 4),
  Within_SS = c(k2$tot.withinss, k3$tot.withinss, k4$tot.withinss),
  Between_Total = c(k2$betweenss / k2$totss, k3$betweenss / k3$totss, k4$betweenss / k4$totss)
)
cat("=== PERBANDINGAN K ===\n")
## === PERBANDINGAN K ===
## K Within_SS Between_Total
## 1 2 2851.793 0.3898603
## 2 3 2165.956 0.5365949
## 3 4 1826.111 0.6093045
##
## Kesimpulan: K=3 dipilih berdasarkan Elbow Method dan Silhouette Score
# Silhouette widths of the K = 3 solution, computed on the distance matrix
# `jarak`; the subtitle reports the overall average width.
sil_k3 <- silhouette(k3$cluster, jarak)
fviz_silhouette(sil_k3) +
  labs(
    subtitle = paste("Average Silhouette Width:", round(mean(sil_k3[, "sil_width"]), 3)),
    title = "Silhouette Plot - K=3"
  ) +
  theme_minimal() +
  theme(
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
  )
## cluster size ave.sil.width
## 1 1 241 0.28
## 2 2 356 0.42
## 3 3 183 0.35
# Attach the K = 3 cluster label to every row of the dataset.
# NOTE(review): this requires `data` to have exactly the rows that were
# clustered (same count and order as `data_scaled`) — confirm.
data$Cluster <- k3$cluster

# Cluster plot of the K = 3 solution; the subtitle shows between-cluster SS as
# a percentage of total SS. `colors_cluster` is defined outside this excerpt.
fviz_cluster(k3, data = data_scaled,
             geom = "point",
             ellipse.type = "convex",
             palette = colors_cluster,
             ggtheme = theme_minimal(),
             main = "") +
  labs(title = "K-Means Clustering (K=3) - PCA Space",
       subtitle = paste("Between_SS / Total_SS =",
                        round(k3$betweenss / k3$totss * 100, 1), "%"),
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
        legend.position = "bottom")

# Per-cluster size and feature means. This is a separate statement: it must
# start on its own line, otherwise the file does not parse.
cluster_summary <- data %>%
  group_by(Cluster) %>%
  summarise(
    n = n(),
    Ram = mean(Ram),
    SSD = mean(SSD),
    HDD = mean(HDD),
    Weight = mean(Weight),
    Ppi = mean(Ppi),
    Price = mean(Price)
  )
cat("=== KARAKTERISTIK CLUSTER ===\n")
## === KARAKTERISTIK CLUSTER ===
## # A tibble: 3 × 8
## Cluster n Ram SSD HDD Weight Ppi Price
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 241 4.18 58.3 473. 2.05 120. 10.3
## 2 2 356 7.61 270. 4.21 1.7 153. 11.0
## 3 3 183 7.81 66.4 1020. 2.38 131. 10.8
# Melt the per-cluster means for plotting; column 2 (`n`, the cluster size) is
# dropped so only the feature means remain.
cluster_melt <- melt(cluster_summary[, -2], id.vars = "Cluster")
cluster_melt$Cluster <- as.factor(cluster_melt$Cluster)
ggplot(cluster_melt, aes(x = variable, y = value, fill = Cluster)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  scale_fill_manual(values = c("#00B894", "#0984E3", "#E17055")) +
  labs(x = NULL, y = "Rata-rata Nilai",
       title = "Perbandingan Karakteristik Antar Cluster",
       subtitle = "Rata-rata nilai fitur per cluster") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        strip.text = element_text(face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
        legend.position = "bottom")

# Number of laptops per cluster. This is a separate statement: it must start
# on its own line, otherwise the file does not parse.
cluster_counts <- table(data$Cluster)
# Derive the segment from the combined ranking of the key features.
rank_price <- rank(cluster_summary$Price)
rank_ram <- rank(cluster_summary$Ram)
rank_ssd <- rank(cluster_summary$SSD)
rank_ppi <- rank(cluster_summary$Ppi)
# Average rank (the higher, the more high-end).
avg_rank <- (rank_price + rank_ram + rank_ssd + rank_ppi) / 4
cat("=== SCORING SEGMENT ===\n")
## === SCORING SEGMENT ===
## Cluster 1: Avg Rank = 1.00
## Cluster 2: Avg Rank = 2.75
## Cluster 3: Avg Rank = 2.25
# Print a short profile for each of the three clusters and label its market
# segment from the average rank: highest -> high-end, lowest -> budget,
# anything else -> mid-range.
for (i in seq_len(3)) {
  cl <- cluster_summary[cluster_summary$Cluster == i, ]
  cluster_share <- 100 * cluster_counts[i] / sum(cluster_counts)
  cat(sprintf("\n=== CLUSTER %d (%d laptops, %.1f%%) ===\n",
              i, cluster_counts[i], cluster_share))
  cat(sprintf("Ram: %.1f GB | SSD: %.0f GB | HDD: %.0f GB\n", cl$Ram, cl$SSD, cl$HDD))
  cat(sprintf("Weight: %.2f kg | Ppi: %.0f | Price: %.2f\n", cl$Weight, cl$Ppi, cl$Price))
  # Pick the segment line first, then print it once.
  segment_line <- if (avg_rank[i] == max(avg_rank)) {
    ">> SEGMENT: HIGH-END\n"
  } else if (avg_rank[i] == min(avg_rank)) {
    ">> SEGMENT: BUDGET/ENTRY-LEVEL\n"
  } else {
    ">> SEGMENT: MID-RANGE\n"
  }
  cat(segment_line)
}
##
## === CLUSTER 1 (241 laptops, 30.9%) ===
## Ram: 4.2 GB | SSD: 58 GB | HDD: 473 GB
## Weight: 2.05 kg | Ppi: 120 | Price: 10.30
## >> SEGMENT: BUDGET/ENTRY-LEVEL
##
## === CLUSTER 2 (356 laptops, 45.6%) ===
## Ram: 7.6 GB | SSD: 270 GB | HDD: 4 GB
## Weight: 1.70 kg | Ppi: 153 | Price: 11.05
## >> SEGMENT: HIGH-END
##
## === CLUSTER 3 (183 laptops, 23.5%) ===
## Ram: 7.8 GB | SSD: 66 GB | HDD: 1020 GB
## Weight: 2.38 kg | Ppi: 131 | Price: 10.78
## >> SEGMENT: MID-RANGE
# For each cluster, list the most frequent brands (at most five) with their
# count and share of the cluster.
for (i in seq_len(3)) {
  cat(sprintf("\nCluster %d - Top 5 Brand:\n", i))
  # head() returns fewer than five entries when the cluster has fewer brands;
  # indexing with [1:5] would pad the result with NA entries instead.
  brands <- head(sort(table(data[data$Cluster == i, "Company"]), decreasing = TRUE), 5)
  # seq_along() yields an empty loop for an empty table, unlike 1:length().
  for (j in seq_along(brands)) {
    cat(sprintf(" %s: %d (%.1f%%)\n", names(brands)[j], brands[j],
                100 * brands[j] / cluster_counts[i]))
  }
}
##
## Cluster 1 - Top 5 Brand:
## HP: 65 (27.0%)
## Dell: 50 (20.7%)
## Lenovo: 50 (20.7%)
## Acer: 37 (15.4%)
## Asus: 22 (9.1%)
##
## Cluster 2 - Top 5 Brand:
## HP: 107 (30.1%)
## Lenovo: 86 (24.2%)
## Dell: 78 (21.9%)
## Toshiba: 25 (7.0%)
## Asus: 24 (6.7%)
##
## Cluster 3 - Top 5 Brand:
## HP: 46 (25.1%)
## Dell: 45 (24.6%)
## Lenovo: 32 (17.5%)
## Asus: 29 (15.8%)
## MSI: 17 (9.3%)
# For each cluster, list every laptop type with its count and share of the
# cluster, most frequent first.
for (i in seq_len(3)) {
  cat(sprintf("\nCluster %d - Type Distribution:\n", i))
  types <- sort(table(data[data$Cluster == i, "TypeName"]), decreasing = TRUE)
  # seq_along() yields an empty loop for an empty table; 1:length(types) would
  # iterate over c(1, 0) and index entries that do not exist.
  for (j in seq_along(types)) {
    cat(sprintf(" %s: %d (%.1f%%)\n", names(types)[j], types[j],
                100 * types[j] / cluster_counts[i]))
  }
}
##
## Cluster 1 - Type Distribution:
## Notebook: 220 (91.3%)
## 2 in 1 Convertible: 11 (4.6%)
## Netbook: 5 (2.1%)
## Ultrabook: 4 (1.7%)
## Gaming: 1 (0.4%)
##
## Cluster 2 - Type Distribution:
## Notebook: 176 (49.4%)
## Ultrabook: 103 (28.9%)
## 2 in 1 Convertible: 49 (13.8%)
## Workstation: 13 (3.7%)
## Gaming: 11 (3.1%)
## Netbook: 4 (1.1%)
##
## Cluster 3 - Type Distribution:
## Notebook: 113 (61.7%)
## Gaming: 57 (31.1%)
## Workstation: 6 (3.3%)
## 2 in 1 Convertible: 5 (2.7%)
## Ultrabook: 2 (1.1%)
# Write the current `data` frame to CSV without row names and confirm the
# file name on the console.
write.csv(x = data, file = "laptop_clustered_k3.csv", row.names = FALSE)
cat("Data dengan label cluster tersimpan: laptop_clustered_k3.csv\n")
## Data dengan label cluster tersimpan: laptop_clustered_k3.csv
## === KESIMPULAN K-MEANS ===
## Dataset: Laptop Windows
## Data awal: 1100 | Setelah cleaning: 780
## Fitur: Ram, SSD, HDD, Weight, Ppi, Price
## K optimal: 3 (Elbow + Silhouette)
## Between_SS/Total_SS: 53.7%
## Silhouette Score: 0.362
##
## Segmentasi yang ditemukan:
## - Budget/Entry-Level
## - Mid-Range
## - High-End
# Use continuous features only (DBSCAN is sensitive to binary variables).
dbscan_features <- c("Ram", "SSD", "HDD", "Weight", "Ppi", "Price")
dbscan_data <- data[dbscan_features]
cat("Dimensi data DBSCAN:", NROW(dbscan_data), "x", NCOL(dbscan_data), "\n")
## Dimensi data DBSCAN: 780 x 6
## Ram SSD HDD Weight
## Min. :4.000 Min. : 0 Min. : 0.0 Min. :0.810
## 1st Qu.:4.000 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:1.600
## Median :8.000 Median :128 Median : 0.0 Median :2.000
## Mean :6.597 Mean :157 Mean : 387.4 Mean :1.971
## 3rd Qu.:8.000 3rd Qu.:256 3rd Qu.:1000.0 3rd Qu.:2.208
## Max. :8.000 Max. :512 Max. :2000.0 Max. :3.520
## Ppi Price
## Min. : 90.58 Min. : 9.409
## 1st Qu.:125.37 1st Qu.:10.420
## Median :141.21 Median :10.779
## Mean :137.33 Mean :10.753
## 3rd Qu.:157.35 3rd Qu.:11.077
## Max. :200.84 Max. :12.257
## Mean setelah scaling:
## Ram SSD HDD Weight Ppi Price
## 0 0 0 0 0 0
##
## Std Dev setelah scaling:
## Ram SSD HDD Weight Ppi Price
## 1 1 1 1 1 1
## PCA Summary:
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7755 1.1170 0.8368 0.60670 0.57099 0.45335
## Proportion of Variance 0.5254 0.2079 0.1167 0.06135 0.05434 0.03425
## Cumulative Proportion 0.5254 0.7333 0.8501 0.91141 0.96575 1.00000
# Keep the first 4 principal components as DBSCAN input and report how much
# variance they explain together. `dbscan_pca` is fitted outside this excerpt.
dbscan_pca_data <- dbscan_pca$x[, seq_len(4)]
cat("Jumlah PC digunakan:", NCOL(dbscan_pca_data), "\n")
## Jumlah PC digunakan: 4
cat(
  "Total variance explained (%):",
  round(
    sum(summary(dbscan_pca)$importance["Proportion of Variance", seq_len(4)]) * 100,
    2
  ),
  "\n"
)
## Total variance explained (%): 91.14
## minPts optimal: 6
## Formula: dimensi data + 2 = 4 + 2
# Neighbourhood radius for DBSCAN; adjust it from the k-NN distance plot below.
# It is defined before the plot so the reference line and the model always use
# the same value.
dbscan_eps <- 1

# k-NN distance plot with a dashed red reference line at the chosen eps.
# `dbscan_minPts` is defined outside this excerpt.
kNNdistplot(dbscan_pca_data, k = dbscan_minPts)
abline(h = dbscan_eps, col = "red", lty = 2, lwd = 2)
title(main = paste("k-NN Distance Plot (k =", dbscan_minPts, ")"),
      sub = "Elbow point menunjukkan eps optimal")

# Fit DBSCAN on the selected principal components.
# NOTE(review): `MinPts` is the argument spelling of fpc::dbscan(); the dbscan
# package's dbscan() names it `minPts` — confirm which package's dbscan() is
# attached here.
dbscan_model <- dbscan(
  dbscan_pca_data,
  eps = dbscan_eps,
  MinPts = dbscan_minPts
)
cat("=== HASIL DBSCAN ===\n")
## === HASIL DBSCAN ===
## eps: 1
## minPts: 6
## Distribusi cluster DBSCAN:
##
## 0 1 2 3
## 23 152 495 110
##
## Cluster 0 = Noise (outlier)
# Noise statistics for the DBSCAN result; label 0 marks noise points.
dbscan_noise <- sum(dbscan_model$cluster == 0)
dbscan_noise_pct <- 100 * dbscan_noise / nrow(dbscan_pca_data)
# Count the distinct non-noise labels. Subtracting 1 from the number of unique
# labels is only right when label 0 is present; with no noise points it would
# report one cluster too few.
dbscan_n_clusters <- length(setdiff(unique(dbscan_model$cluster), 0))
cat(sprintf("Noise: %d data (%.2f%%)\n", dbscan_noise, dbscan_noise_pct))
## Noise: 23 data (2.95%)
## Jumlah cluster (tanpa noise): 3
cat(sprintf("Data valid (non-noise): %d (%.2f%%)\n",
            nrow(dbscan_pca_data) - dbscan_noise,
            100 - dbscan_noise_pct))
## Data valid (non-noise): 757 (97.05%)
# Silhouette analysis for DBSCAN on the points that were assigned to a cluster
# (label 0 is excluded). It needs at least two distinct clusters.
dbscan_idx <- which(dbscan_model$cluster != 0)
if (length(unique(dbscan_model$cluster[dbscan_idx])) <= 1) {
  cat("Silhouette tidak dapat dihitung (cluster < 2)\n")
} else {
  dbscan_sil <- silhouette(
    dbscan_model$cluster[dbscan_idx],
    dist(dbscan_pca_data[dbscan_idx, ])
  )
  cat(sprintf(
    "Average Silhouette Score (DBSCAN): %.3f\n",
    mean(dbscan_sil[, "sil_width"])
  ))
  # Kept as the last expression of the branch so the plot is printed.
  fviz_silhouette(dbscan_sil) +
    labs(
      subtitle = "Exclude Noise Points",
      title = "Silhouette Plot - DBSCAN"
    ) +
    theme_minimal()
}
## Average Silhouette Score (DBSCAN): 0.251
## cluster size ave.sil.width
## 1 1 152 0.45
## 2 2 495 0.18
## 3 3 110 0.30
# Cluster plot of the DBSCAN labels on the first two principal components.
# The palette puts gray first, ahead of `colors_cluster` (defined outside this
# excerpt).
fviz_cluster(
  list(
    data = dbscan_pca_data[, 1:2],
    cluster = dbscan_model$cluster
  ),
  geom = "point",
  ellipse = TRUE,
  ellipse.type = "convex",
  palette = c("gray40", colors_cluster),
  ggtheme = theme_minimal(),
  main = ""
) +
  labs(
    title = "DBSCAN Clustering Result (PCA Space)",
    subtitle = paste(
      "eps =", dbscan_eps,
      "| minPts =", dbscan_minPts,
      "| Noise =", dbscan_noise, sprintf("(%.1f%%)", dbscan_noise_pct)
    ),
    x = "PC1",
    y = "PC2"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    legend.position = "bottom"
  )

# Attach the DBSCAN label to the dataset and export it. This is a separate
# statement: it must start on its own line, otherwise the file does not parse.
data$DBSCAN_Cluster <- dbscan_model$cluster
write.csv(data, "laptop_windows_dbscan.csv", row.names = FALSE)
cat("File DBSCAN tersimpan: laptop_windows_dbscan.csv\n")
## File DBSCAN tersimpan: laptop_windows_dbscan.csv
## Kolom DBSCAN_Cluster ditambahkan ke dataset
# Total number of non-noise points, computed before grouping. Inside a grouped
# summarise(), sum(DBSCAN_Cluster != 0) only sees the current group, so it
# equals n() and the percentage came out as 100 for every cluster.
dbscan_non_noise_total <- sum(data$DBSCAN_Cluster != 0)

# Per-cluster profile of the DBSCAN result, noise (label 0) excluded:
# size, share of all non-noise points, feature means and touchscreen share.
dbscan_summary <- data %>%
  filter(DBSCAN_Cluster != 0) %>%
  group_by(DBSCAN_Cluster) %>%
  summarise(
    n = n(),
    pct = n() / dbscan_non_noise_total * 100,
    avg_price = mean(Price),
    avg_ram = mean(Ram),
    avg_ssd = mean(SSD),
    avg_hdd = mean(HDD),
    avg_weight = mean(Weight),
    avg_ppi = mean(Ppi),
    pct_touchscreen = mean(TouchScreen) * 100
  )
cat("=== KARAKTERISTIK CLUSTER DBSCAN ===\n\n")
## === KARAKTERISTIK CLUSTER DBSCAN ===
## # A tibble: 3 × 10
## DBSCAN_Cluster n pct avg_price avg_ram avg_ssd avg_hdd avg_weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 152 100 10.3 4 2.53 711. 2.11
## 2 2 495 100 11.0 7.94 203. 340. 1.95
## 3 3 110 100 10.5 4.22 188 0.29 1.83
## # ℹ 2 more variables: avg_ppi <dbl>, pct_touchscreen <dbl>
##
## === PERBANDINGAN K-MEANS vs DBSCAN ===
## K-Means:
## - Jumlah cluster: 3
## - Total data: 780
## - Semua data masuk cluster
## DBSCAN:
## - Jumlah cluster: 3
## - Total data: 780
## - Noise/outlier: 23 (2.95%)
## - Data valid: 757
## === KESIMPULAN DBSCAN ===
## Dataset: Laptop Windows (sama dengan K-Means)
## Data diproses: 780 laptop
## Fitur: Ram, SSD, HDD, Weight, Ppi, Price
## Parameter: eps = 1, minPts = 6
## Jumlah cluster: 3
## Noise: 23 data (2.95%)
## Silhouette Score: 0.251
##
## Kelebihan DBSCAN dibanding K-Means:
## - Tidak perlu tentukan jumlah cluster
## - Dapat deteksi outlier/noise
## - Dapat menemukan cluster bentuk arbitrary