1. Tujuan dan Alasan Klasterisasi

1.1 Import Library

library(cluster)
library(factoextra)
library(ggplot2)
library(reshape2)
library(corrplot)
library(dplyr)

1.2 Load Dataset

# Load the raw laptop catalogue from the working directory.
data <- read.csv(file = "laptop.csv")

# Report the size of the raw table before any filtering is applied.
cat("Dimensi data awal:", dim(data)[1], "baris,", dim(data)[2], "kolom\n")
## Dimensi data awal: 1273 baris, 13 kolom

1.3 Tentukan Fokus Analisis

# Operating-system distribution before any filtering
cat("Distribusi OS:\n")
## Distribusi OS:
os_counts <- table(data$Os)
print(os_counts)
## 
##     Mac  Others Windows 
##      21     152    1100

Filter dataset hanya untuk laptop dengan OS = Windows.

# Keep only Windows laptops.
# `%in%` is used instead of `==` because it never returns NA: a row with a
# missing Os value is simply dropped, whereas logical subsetting with an NA
# condition would insert an all-NA row into the result.
is_windows <- data$Os %in% "Windows"
data <- data[is_windows, , drop = FALSE]
cat("\nData setelah filter Windows:", nrow(data), "laptop\n")
## 
## Data setelah filter Windows: 1100 laptop

2. Persiapan Data

2.1 Filter Dataset

# Drop invalid records: laptops that report neither SSD nor HDD capacity.
has_no_storage <- data$SSD == 0 & data$HDD == 0
invalid_idx <- which(has_no_storage)
cat("Data invalid (SSD=0 & HDD=0):", length(invalid_idx), "\n")
## Data invalid (SSD=0 & HDD=0): 46
# Negative indexing with an empty index would select zero rows, so the
# removal only runs when at least one invalid row was found.
if (length(invalid_idx) > 0) {
  data <- data[-invalid_idx, ]
}
cat("Data setelah buang invalid:", nrow(data), "laptop\n")
## Data setelah buang invalid: 1054 laptop

2.2 Buang Kolom yang Tidak Diperlukan

# Numeric features that take part in the clustering
features <- c("Ram", "SSD", "HDD", "Weight", "Ppi", "Price")
data_cluster <- data[features]

cat("Fitur yang digunakan:", toString(features), "\n")
## Fitur yang digunakan: Ram, SSD, HDD, Weight, Ppi, Price
cat("Dimensi data clustering:", dim(data_cluster)[1], "x", dim(data_cluster)[2], "\n")
## Dimensi data clustering: 1054 x 6

2.3 Periksa Tipe Data

# Inspect the storage type of every clustering column
cat("Tipe data per kolom:\n")
## Tipe data per kolom:
str(data_cluster)
## 'data.frame':    1054 obs. of  6 variables:
##  $ Ram   : int  4 16 8 4 8 8 8 8 8 4 ...
##  $ SSD   : int  0 512 256 256 256 128 128 256 256 0 ...
##  $ HDD   : int  500 0 0 0 0 0 1000 0 0 1000 ...
##  $ Weight: num  2.1 1.3 1.6 2.2 2.2 1.22 2.5 1.62 1.91 2.3 ...
##  $ Ppi   : num  100 157 157 141 141 ...
##  $ Price : num  9.97 11.29 10.62 10.19 10.59 ...
# Confirm that every column is numeric (integer or double)
is_numeric_column <- vapply(data_cluster, is.numeric, logical(1))
cat("\nSemua kolom numerik:", all(is_numeric_column), "\n")
## 
## Semua kolom numerik: TRUE

2.4 Pengecekan Nilai Null

# Count missing values in each clustering column
cat("Jumlah missing value per kolom:\n")
## Jumlah missing value per kolom:
missing_per_column <- colSums(is.na(data_cluster))
print(missing_per_column)
##    Ram    SSD    HDD Weight    Ppi  Price 
##      0      0      0      0      0      0

3. Analisis Deskriptif

3.1 Ringkasan Dataset

# Five-number summary plus mean for every clustering feature
summary(data_cluster)
##       Ram              SSD              HDD             Weight     
##  Min.   : 2.000   Min.   :   0.0   Min.   :   0.0   Min.   :0.810  
##  1st Qu.: 6.000   1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:1.565  
##  Median : 8.000   Median : 256.0   Median :   0.0   Median :2.040  
##  Mean   : 8.953   Mean   : 210.5   Mean   : 422.8   Mean   :2.077  
##  3rd Qu.: 8.000   3rd Qu.: 256.0   3rd Qu.:1000.0   3rd Qu.:2.360  
##  Max.   :64.000   Max.   :1024.0   Max.   :2000.0   Max.   :4.700  
##       Ppi             Price       
##  Min.   : 90.58   Min.   : 9.409  
##  1st Qu.:127.34   1st Qu.:10.550  
##  Median :141.21   Median :10.977  
##  Mean   :148.55   Mean   :10.944  
##  3rd Qu.:157.35   3rd Qu.:11.353  
##  Max.   :352.47   Max.   :12.691

3.2 Visualisasi Awal - Boxplot

# Long format (one row per feature/value pair) so each feature gets a facet.
boxplot_data <- melt(data_cluster)

# Per-feature boxplots with free scales; outliers are highlighted in red.
ggplot(data = boxplot_data,
       mapping = aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(outlier.color = "#E74C3C", outlier.size = 2, alpha = 0.8) +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  scale_fill_manual(
    values = c("#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#3B1F2B", "#1B998B")
  ) +
  labs(title = "Boxplot - Deteksi Outlier",
       subtitle = "Sebelum IQR Cleaning",
       x = NULL,
       y = "Value") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
        strip.text = element_text(face = "bold", size = 11),
        axis.text.x = element_blank(),
        legend.position = "none")

3.3 Visualisasi Awal - Histogram

# Per-feature histograms (free scales) built from the long-format table.
ggplot(data = boxplot_data, mapping = aes(x = value, fill = variable)) +
  geom_histogram(bins = 25, alpha = 0.85, color = "white") +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  scale_fill_manual(
    values = c("#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#3B1F2B", "#1B998B")
  ) +
  labs(title = "Histogram - Distribusi Fitur",
       subtitle = "Melihat sebaran data tiap variabel",
       x = "Value",
       y = "Frequency") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
        strip.text = element_text(face = "bold", size = 11),
        legend.position = "none")

3.4 Data Cleaning - IQR Method

cat("Data sebelum IQR cleaning:", nrow(data_cluster), "\n\n")
## Data sebelum IQR cleaning: 1054
# Per-feature quartiles. vapply() walks the data frame column by column and
# guarantees one numeric value per feature; apply() would first coerce the
# whole data frame to a matrix. `names = FALSE` drops quantile()'s "25%"
# label so the result is named by feature only.
Q1 <- vapply(data_cluster, quantile, numeric(1), probs = 0.25, names = FALSE)
Q3 <- vapply(data_cluster, quantile, numeric(1), probs = 0.75, names = FALSE)
IQR_val <- Q3 - Q1

# Outlier fences per feature: 1.5 * IQR beyond the quartiles
upper <- Q3 + 1.5 * IQR_val
lower <- Q1 - 1.5 * IQR_val

# Show the quartiles and fences, one row per feature
cat("Batas IQR:\n")
## Batas IQR:
iqr_table <- data.frame(Q1 = Q1, Q3 = Q3, IQR = IQR_val,
                        Lower = lower, Upper = upper)
print(round(iqr_table, 2))
##            Q1      Q3     IQR    Lower   Upper
## Ram      6.00    8.00    2.00     3.00   11.00
## SSD      0.00  256.00  256.00  -384.00  640.00
## HDD      0.00 1000.00 1000.00 -1500.00 2500.00
## Weight   1.56    2.36    0.79     0.37    3.55
## Ppi    127.34  157.35   30.01    82.31  202.37
## Price   10.55   11.35    0.80     9.35   12.56
# Remove every row that falls outside the fences on at least one feature.
# One logical mask is built per feature and the masks are combined with `&`,
# so the table is subset once instead of being re-subset inside a `1:ncol()`
# loop. `drop = FALSE` keeps a data frame (and its row names, which are used
# later to re-align `data`) even if only a single feature column is present.
in_range <- Map(
  function(values, lo, hi) values >= lo & values <= hi,
  data_cluster, lower, upper
)
keep_row <- Reduce(`&`, in_range)
cleaned_cluster <- data_cluster[keep_row, , drop = FALSE]

n_kept <- nrow(cleaned_cluster)
cat("\nData setelah IQR cleaning:", n_kept, "\n")
## 
## Data setelah IQR cleaning: 780
cat("Outlier dihapus:", nrow(data_cluster) - n_kept, "\n")
## Outlier dihapus: 274
# Keep the full records in step with the cleaned feature table by selecting
# the surviving rows through their row names.
surviving_rows <- rownames(cleaned_cluster)
data <- data[surviving_rows, ]
data_cluster <- cleaned_cluster

3.5 Boxplot Setelah Cleaning

# Long format of the cleaned features for faceted plotting.
boxplot_clean <- melt(data_cluster)

# Same per-feature boxplots as before, now on the cleaned data.
ggplot(data = boxplot_clean,
       mapping = aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(alpha = 0.8) +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  scale_fill_manual(
    values = c("#27AE60", "#2ECC71", "#1ABC9C", "#16A085", "#138D75", "#0E6655")
  ) +
  labs(title = "Boxplot - Setelah IQR Cleaning",
       subtitle = "Data lebih bersih dari outlier",
       x = NULL,
       y = "Value") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
        strip.text = element_text(face = "bold", size = 11),
        axis.text.x = element_blank(),
        legend.position = "none")

4. Normalisasi Data

4.1 Standardisasi (Z-Score)

# Z-score standardisation: centre each feature and divide by its std dev.
data_scaled <- scale(data_cluster, center = TRUE, scale = TRUE)

# Sanity check of the scaling
cat("Mean setelah scaling (harus ~0):\n")
## Mean setelah scaling (harus ~0):
scaled_means <- colMeans(data_scaled)
print(round(scaled_means, 10))
##    Ram    SSD    HDD Weight    Ppi  Price 
##      0      0      0      0      0      0
cat("\nStd Dev setelah scaling (harus ~1):\n")
## 
## Std Dev setelah scaling (harus ~1):
scaled_sds <- apply(data_scaled, 2, sd)
print(round(scaled_sds, 10))
##    Ram    SSD    HDD Weight    Ppi  Price 
##      1      1      1      1      1      1

4.2 Cek Distribusi Setelah Normalisasi

# The scaled matrix goes back to a data frame, then to long format for ggplot.
scaled_df <- as.data.frame(data_scaled)
scaled_melt <- melt(scaled_df)

# Per-feature histograms of the z-scores.
ggplot(data = scaled_melt, mapping = aes(x = value, fill = variable)) +
  geom_histogram(bins = 25, alpha = 0.85, color = "white") +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  scale_fill_manual(
    values = c("#9B59B6", "#8E44AD", "#7D3C98", "#6C3483", "#5B2C6F", "#4A235A")
  ) +
  labs(title = "Distribusi Setelah Normalisasi",
       subtitle = "Semua fitur dalam skala yang sama",
       x = "Scaled Value (Z-Score)",
       y = "Frequency") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
        strip.text = element_text(face = "bold", size = 11),
        legend.position = "none")

5. Pemilihan Fitur

5.1 Analisis Korelasi Antar Fitur

# Pairwise correlation of the (unscaled) clustering features.
cor_matrix <- cor(data_cluster)
cat("Correlation Matrix:\n")
## Correlation Matrix:
print(round(cor_matrix, 2))
##          Ram   SSD   HDD Weight   Ppi Price
## Ram     1.00  0.46 -0.10  -0.04  0.39  0.60
## SSD     0.46  1.00 -0.69  -0.38  0.53  0.58
## HDD    -0.10 -0.69  1.00   0.51 -0.40 -0.30
## Weight -0.04 -0.38  0.51   1.00 -0.55 -0.23
## Ppi     0.39  0.53 -0.40  -0.55  1.00  0.54
## Price   0.60  0.58 -0.30  -0.23  0.54  1.00
# Upper-triangle heat map on a red-white-blue ramp with coefficients overlaid
corrplot(cor_matrix,
         title = "Korelasi Antar Fitur",
         method = "color",
         type = "upper",
         col = colorRampPalette(c("#E74C3C", "#FFFFFF", "#3498DB"))(100),
         addCoef.col = "black",
         number.cex = 0.9,
         tl.col = "black",
         tl.cex = 1.1,
         tl.srt = 45,
         mar = c(0,0,2,0))

5.2 PCA (Principal Component Analysis)

# PCA on the already standardised features; prcomp() centres the data and
# applies no further scaling (its defaults, spelled out here).
pca_result <- prcomp(data_scaled, center = TRUE, scale. = FALSE)

# Variance explained by each component
pca_var <- summary(pca_result)
print(pca_var)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.7755 1.1170 0.8368 0.60670 0.57099 0.45335
## Proportion of Variance 0.5254 0.2079 0.1167 0.06135 0.05434 0.03425
## Cumulative Proportion  0.5254 0.7333 0.8501 0.91141 0.96575 1.00000
# Scree plot
fviz_eig(pca_result,
         barfill = "#3498DB",
         barcolor = "#2980B9",
         linecolor = "#E74C3C",
         addlabels = TRUE) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40")) +
  labs(title = "PCA - Variance Explained",
       subtitle = "Kontribusi setiap Principal Component",
       x = "Principal Component",
       y = "Percentage of Variance Explained")

# Variable plot on the first two components, coloured by contribution
fviz_pca_var(pca_result,
             repel = TRUE,
             col.var = "contrib",
             gradient.cols = c("#00B894", "#FDCB6E", "#E17055")) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40")) +
  labs(title = "PCA - Kontribusi Fitur",
       subtitle = "Hubungan fitur dengan Dim1 & Dim2")

6. K-Means Clustering

6.1 Visualisasi Jarak Antar Data

# Pairwise Euclidean distances between the standardised observations
# (get_dist()'s default method, made explicit). Reused by the silhouette steps.
jarak <- get_dist(data_scaled, method = "euclidean")

# Heat map of the distance matrix
distance_gradient <- list(low = "#00AFBB", mid = "white", high = "#FC4E07")
fviz_dist(jarak, gradient = distance_gradient) +
  labs(title = "Matriks Jarak Euclidean",
       subtitle = "Semakin gelap = semakin jauh jaraknya") +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

6.2 Eksperimen K-Means dengan Berbagai K

6.2.1 K-Means dengan K=2

# Fixed seed so the 25 random starts are reproducible.
set.seed(123)
k2 <- kmeans(data_scaled, centers = 2, nstart = 25)

k2_between_pct <- k2$betweenss / k2$totss * 100
cat("K=2 | Total within SS:", round(k2$tot.withinss, 2), "\n")
## K=2 | Total within SS: 2851.79
cat("K=2 | Between/Total SS:", round(k2_between_pct, 2), "%\n")
## K=2 | Between/Total SS: 38.99 %

6.2.2 K-Means dengan K=3

# Fixed seed so the 25 random starts are reproducible.
set.seed(123)
k3 <- kmeans(data_scaled, centers = 3, nstart = 25)

k3_between_pct <- k3$betweenss / k3$totss * 100
cat("K=3 | Total within SS:", round(k3$tot.withinss, 2), "\n")
## K=3 | Total within SS: 2165.96
cat("K=3 | Between/Total SS:", round(k3_between_pct, 2), "%\n")
## K=3 | Between/Total SS: 53.66 %

6.2.3 K-Means dengan K=4

# Fixed seed so the 25 random starts are reproducible.
set.seed(123)
k4 <- kmeans(data_scaled, centers = 4, nstart = 25)

k4_between_pct <- k4$betweenss / k4$totss * 100
cat("K=4 | Total within SS:", round(k4$tot.withinss, 2), "\n")
## K=4 | Total within SS: 1826.11
cat("K=4 | Between/Total SS:", round(k4_between_pct, 2), "%\n")
## K=4 | Between/Total SS: 60.93 %

6.3 Validasi dan Pemilihan K Optimal

6.3.1 Elbow Method

# Elbow method: total within-cluster sum of squares for a range of K.
# fviz_nbclust() re-runs kmeans() with random starts, so the seed is fixed
# here (as for the individual K=2/3/4 fits) to make the curve reproducible.
set.seed(123)
fviz_nbclust(data_scaled, kmeans, method = "wss", nstart = 25) +
  # Dashed reference line at the K reported in the subtitle
  geom_vline(xintercept = 3, linetype = 2, color = "#E74C3C", linewidth = 1) +
  labs(title = "Elbow Method - Menentukan K Optimal",
       subtitle = "Elbow (siku) berada di K=3",
       x = "Jumlah Cluster (K)",
       y = "Total Within Sum of Squares (WSS)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

6.3.2 Silhouette Score

# Silhouette method: average silhouette width for a range of K.
# fviz_nbclust() re-runs kmeans() with random starts, so the seed is fixed
# here (as for the individual K=2/3/4 fits) to make the curve reproducible.
set.seed(123)
fviz_nbclust(data_scaled, kmeans, method = "silhouette", nstart = 25) +
  labs(title = "Silhouette Method - Menentukan K Optimal",
       subtitle = "Silhouette tertinggi di K=3",
       x = "Jumlah Cluster (K)",
       y = "Average Silhouette Width") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

6.3.3 Perbandingan Hasil K=2, K=3, K=4

# Side-by-side fit statistics for the three candidate models.
candidate_models <- list(k2, k3, k4)
comparison <- data.frame(
  K = c(2, 3, 4),
  Within_SS = vapply(candidate_models,
                     function(model) model$tot.withinss, numeric(1)),
  Between_Total = vapply(candidate_models,
                         function(model) model$betweenss / model$totss,
                         numeric(1))
)

cat("=== PERBANDINGAN K ===\n")
## === PERBANDINGAN K ===
print(comparison)
##   K Within_SS Between_Total
## 1 2  2851.793     0.3898603
## 2 3  2165.956     0.5365949
## 3 4  1826.111     0.6093045
cat("\nKesimpulan: K=3 dipilih berdasarkan Elbow Method dan Silhouette Score\n")
## 
## Kesimpulan: K=3 dipilih berdasarkan Elbow Method dan Silhouette Score

6.3.4 Silhouette Plot untuk K=3

# Per-observation silhouette widths for the K=3 solution, using the
# precomputed distance matrix.
sil_k3 <- silhouette(k3$cluster, jarak)
sil_k3_subtitle <- paste("Average Silhouette Width:",
                         round(mean(sil_k3[, 3]), 3))

fviz_silhouette(sil_k3) +
  labs(title = "Silhouette Plot - K=3", subtitle = sil_k3_subtitle) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))
##   cluster size ave.sil.width
## 1       1  241          0.28
## 2       2  356          0.42
## 3       3  183          0.35

6.4 Analisis Hasil Cluster Terpilih (K=3)

6.4.1 Visualisasi Cluster

# Attach the K=3 assignment to the full records (same row order as the
# matrix that was clustered).
data[["Cluster"]] <- k3$cluster

# NOTE(review): `colors_cluster` is not defined in the visible part of this
# file - presumably a palette created in a setup step; confirm it exists.
fviz_cluster(k3,
             data = data_scaled,
             geom = "point",
             ellipse.type = "convex",
             palette = colors_cluster,
             main = "",
             ggtheme = theme_minimal()) +
  labs(title = "K-Means Clustering (K=3) - PCA Space",
       subtitle = paste("Between_SS / Total_SS =",
                        round(k3$betweenss / k3$totss * 100, 1), "%"),
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme(legend.position = "bottom",
        plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

6.4.2 Karakteristik Cluster

# Cluster size and the mean of each clustering feature, one row per cluster.
cluster_summary <- data %>%
  group_by(Cluster) %>%
  summarise(
    n = n(),
    across(c(Ram, SSD, HDD, Weight, Ppi, Price), mean)
  )

cat("=== KARAKTERISTIK CLUSTER ===\n")
## === KARAKTERISTIK CLUSTER ===
print(round(cluster_summary, 2))
## # A tibble: 3 × 8
##   Cluster     n   Ram   SSD     HDD Weight   Ppi Price
##     <dbl> <dbl> <dbl> <dbl>   <dbl>  <dbl> <dbl> <dbl>
## 1       1   241  4.18  58.3  473.     2.05  120.  10.3
## 2       2   356  7.61 270.     4.21   1.7   153.  11.0
## 3       3   183  7.81  66.4 1020.     2.38  131.  10.8

6.4.3 Visualisasi Perbandingan Karakteristik

# Drop the size column `n`, then reshape to one row per cluster/feature pair.
summary_without_n <- cluster_summary[, names(cluster_summary) != "n"]
cluster_melt <- melt(summary_without_n, id.vars = "Cluster")
# Treat the cluster label as a category so it maps to discrete fill colours.
cluster_melt$Cluster <- factor(cluster_melt$Cluster)

# Dodged bars of the per-cluster feature means, one facet per feature.
ggplot(data = cluster_melt,
       mapping = aes(x = variable, y = value, fill = Cluster)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  facet_wrap(~variable, ncol = 3, scales = "free") +
  scale_fill_manual(values = c("#00B894", "#0984E3", "#E17055")) +
  labs(title = "Perbandingan Karakteristik Antar Cluster",
       subtitle = "Rata-rata nilai fitur per cluster",
       x = NULL,
       y = "Rata-rata Nilai") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
        strip.text = element_text(face = "bold", size = 11),
        axis.text.x = element_blank(),
        legend.position = "bottom")

6.4.4 Interpretasi Bisnis

# Number of laptops per cluster, in cluster-label order
cluster_counts <- table(data$Cluster)

# Rank the clusters on each key spec (rank 1 = lowest cluster mean)
rank_price <- rank(cluster_summary$Price)
rank_ram <- rank(cluster_summary$Ram)
rank_ssd <- rank(cluster_summary$SSD)
rank_ppi <- rank(cluster_summary$Ppi)

# Mean of the four ranks: the higher the value, the more high-end the cluster
avg_rank <- (rank_price + rank_ram + rank_ssd + rank_ppi) / 4

cat("=== SCORING SEGMENT ===\n")
## === SCORING SEGMENT ===
cat(sprintf("Cluster 1: Avg Rank = %.2f\n", avg_rank[1]))
## Cluster 1: Avg Rank = 1.00
cat(sprintf("Cluster 2: Avg Rank = %.2f\n", avg_rank[2]))
## Cluster 2: Avg Rank = 2.75
cat(sprintf("Cluster 3: Avg Rank = %.2f\n", avg_rank[3]))
## Cluster 3: Avg Rank = 2.25
total_laptops <- sum(cluster_counts)
top_rank <- max(avg_rank)
bottom_rank <- min(avg_rank)
for (cluster_id in 1:3) {
  cluster_size <- cluster_counts[cluster_id]
  cat(sprintf("\n=== CLUSTER %d (%d laptops, %.1f%%) ===\n",
              cluster_id, cluster_size, 100 * cluster_size / total_laptops))

  # Mean feature values of this cluster
  profile <- cluster_summary[cluster_summary$Cluster == cluster_id, ]
  cat(sprintf("Ram: %.1f GB | SSD: %.0f GB | HDD: %.0f GB\n",
              profile$Ram, profile$SSD, profile$HDD))
  cat(sprintf("Weight: %.2f kg | Ppi: %.0f | Price: %.2f\n",
              profile$Weight, profile$Ppi, profile$Price))

  # Segment label: highest average rank = high-end, lowest = budget,
  # anything in between = mid-range
  if (avg_rank[cluster_id] == top_rank) {
    cat(">> SEGMENT: HIGH-END\n")
  } else if (avg_rank[cluster_id] == bottom_rank) {
    cat(">> SEGMENT: BUDGET/ENTRY-LEVEL\n")
  } else {
    cat(">> SEGMENT: MID-RANGE\n")
  }
}
## 
## === CLUSTER 1 (241 laptops, 30.9%) ===
## Ram: 4.2 GB | SSD: 58 GB | HDD: 473 GB
## Weight: 2.05 kg | Ppi: 120 | Price: 10.30
## >> SEGMENT: BUDGET/ENTRY-LEVEL
## 
## === CLUSTER 2 (356 laptops, 45.6%) ===
## Ram: 7.6 GB | SSD: 270 GB | HDD: 4 GB
## Weight: 1.70 kg | Ppi: 153 | Price: 11.05
## >> SEGMENT: HIGH-END
## 
## === CLUSTER 3 (183 laptops, 23.5%) ===
## Ram: 7.8 GB | SSD: 66 GB | HDD: 1020 GB
## Weight: 2.38 kg | Ppi: 131 | Price: 10.78
## >> SEGMENT: MID-RANGE

6.4.5 Distribusi Brand per Cluster

# Top brands (at most five) in each cluster, with their share of the cluster.
# The loop follows the clusters present in `cluster_counts` instead of a
# hard-coded 1:3.
for (i in seq_along(cluster_counts)) {
  cat(sprintf("\nCluster %d - Top 5 Brand:\n", i))
  brand_counts <- sort(table(data[data$Cluster == i, "Company"]),
                       decreasing = TRUE)
  # Cap at the number of brands actually present: indexing with a fixed 1:5
  # would pad the result with NA entries for a cluster with fewer brands.
  brands <- brand_counts[seq_len(min(5, length(brand_counts)))]
  # seq_along() yields an empty sequence for an empty table, whereas
  # 1:length() would iterate over c(1, 0).
  for (j in seq_along(brands)) {
    cat(sprintf("  %s: %d (%.1f%%)\n", names(brands)[j], brands[j],
                100 * brands[j] / cluster_counts[i]))
  }
}
## 
## Cluster 1 - Top 5 Brand:
##   HP: 65 (27.0%)
##   Dell: 50 (20.7%)
##   Lenovo: 50 (20.7%)
##   Acer: 37 (15.4%)
##   Asus: 22 (9.1%)
## 
## Cluster 2 - Top 5 Brand:
##   HP: 107 (30.1%)
##   Lenovo: 86 (24.2%)
##   Dell: 78 (21.9%)
##   Toshiba: 25 (7.0%)
##   Asus: 24 (6.7%)
## 
## Cluster 3 - Top 5 Brand:
##   HP: 46 (25.1%)
##   Dell: 45 (24.6%)
##   Lenovo: 32 (17.5%)
##   Asus: 29 (15.8%)
##   MSI: 17 (9.3%)

6.4.6 Distribusi Type per Cluster

# Laptop type breakdown of each cluster, most frequent type first.
# The loop follows the clusters present in `cluster_counts` instead of a
# hard-coded 1:3.
for (i in seq_along(cluster_counts)) {
  cat(sprintf("\nCluster %d - Type Distribution:\n", i))
  types <- sort(table(data[data$Cluster == i, "TypeName"]), decreasing = TRUE)
  # seq_along() yields an empty sequence for an empty table, whereas
  # 1:length() would iterate over c(1, 0) and print NA lines.
  for (j in seq_along(types)) {
    cat(sprintf("  %s: %d (%.1f%%)\n", names(types)[j], types[j],
                100 * types[j] / cluster_counts[i]))
  }
}
## 
## Cluster 1 - Type Distribution:
##   Notebook: 220 (91.3%)
##   2 in 1 Convertible: 11 (4.6%)
##   Netbook: 5 (2.1%)
##   Ultrabook: 4 (1.7%)
##   Gaming: 1 (0.4%)
## 
## Cluster 2 - Type Distribution:
##   Notebook: 176 (49.4%)
##   Ultrabook: 103 (28.9%)
##   2 in 1 Convertible: 49 (13.8%)
##   Workstation: 13 (3.7%)
##   Gaming: 11 (3.1%)
##   Netbook: 4 (1.1%)
## 
## Cluster 3 - Type Distribution:
##   Notebook: 113 (61.7%)
##   Gaming: 57 (31.1%)
##   Workstation: 6 (3.3%)
##   2 in 1 Convertible: 5 (2.7%)
##   Ultrabook: 2 (1.1%)

6.5 Export Hasil K-Means

# Save the full records together with their K-Means cluster label.
write.csv(data, file = "laptop_clustered_k3.csv", row.names = FALSE)
cat("Data dengan label cluster tersimpan: laptop_clustered_k3.csv\n")
## Data dengan label cluster tersimpan: laptop_clustered_k3.csv

6.6 Kesimpulan K-Means

# Text summary of the K-Means analysis
cat("=== KESIMPULAN K-MEANS ===\n\n")
## === KESIMPULAN K-MEANS ===
cat("Dataset: Laptop Windows\n")
## Dataset: Laptop Windows
n_clean <- nrow(data)
cat(sprintf("Data awal: 1100 | Setelah cleaning: %d\n", n_clean))
## Data awal: 1100 | Setelah cleaning: 780
feature_list <- paste(features, collapse = ", ")
cat(sprintf("Fitur: %s\n", feature_list))
## Fitur: Ram, SSD, HDD, Weight, Ppi, Price
cat(sprintf("K optimal: 3 (Elbow + Silhouette)\n"))
## K optimal: 3 (Elbow + Silhouette)
cat(sprintf("Between_SS/Total_SS: %.1f%%\n", 100 * k3$betweenss / k3$totss))
## Between_SS/Total_SS: 53.7%
# Average silhouette width of the K=3 solution (column 3 holds the widths)
k3_sil_widths <- silhouette(k3$cluster, jarak)[, 3]
cat(sprintf("Silhouette Score: %.3f\n", mean(k3_sil_widths)))
## Silhouette Score: 0.362
cat("\nSegmentasi yang ditemukan:\n")
## 
## Segmentasi yang ditemukan:
cat("  - Budget/Entry-Level\n")
##   - Budget/Entry-Level
cat("  - Mid-Range\n")
##   - Mid-Range
cat("  - High-End\n")
##   - High-End

7. DBSCAN Clustering

7.1 Import Library DBSCAN

library(dbscan)
library(fpc)

7.2 Persiapan Data untuk DBSCAN

# Use the continuous features only (DBSCAN is sensitive to binary features)
dbscan_features <- c("Ram", "SSD", "HDD", "Weight", "Ppi", "Price")

dbscan_data <- data[dbscan_features]

cat("Dimensi data DBSCAN:", dim(dbscan_data)[1], "x", dim(dbscan_data)[2], "\n")
## Dimensi data DBSCAN: 780 x 6

7.2.1 Ringkasan Data DBSCAN

# Five-number summary plus mean for every DBSCAN input feature
summary(dbscan_data)
##       Ram             SSD           HDD             Weight     
##  Min.   :4.000   Min.   :  0   Min.   :   0.0   Min.   :0.810  
##  1st Qu.:4.000   1st Qu.:  0   1st Qu.:   0.0   1st Qu.:1.600  
##  Median :8.000   Median :128   Median :   0.0   Median :2.000  
##  Mean   :6.597   Mean   :157   Mean   : 387.4   Mean   :1.971  
##  3rd Qu.:8.000   3rd Qu.:256   3rd Qu.:1000.0   3rd Qu.:2.208  
##  Max.   :8.000   Max.   :512   Max.   :2000.0   Max.   :3.520  
##       Ppi             Price       
##  Min.   : 90.58   Min.   : 9.409  
##  1st Qu.:125.37   1st Qu.:10.420  
##  Median :141.21   Median :10.779  
##  Mean   :137.33   Mean   :10.753  
##  3rd Qu.:157.35   3rd Qu.:11.077  
##  Max.   :200.84   Max.   :12.257

7.2.2 Normalisasi Data DBSCAN

# Z-score standardisation of the DBSCAN inputs (scale()'s defaults, explicit).
dbscan_scaled <- scale(dbscan_data, center = TRUE, scale = TRUE)

# Sanity check of the scaling
cat("Mean setelah scaling:\n")
## Mean setelah scaling:
round(colMeans(dbscan_scaled), digits = 5)
##    Ram    SSD    HDD Weight    Ppi  Price 
##      0      0      0      0      0      0
cat("\nStd Dev setelah scaling:\n")
## 
## Std Dev setelah scaling:
round(apply(dbscan_scaled, 2, sd), digits = 5)
##    Ram    SSD    HDD Weight    Ppi  Price 
##      1      1      1      1      1      1

7.2.3 PCA untuk Reduksi Dimensi

# PCA on the standardised DBSCAN inputs.
dbscan_pca <- prcomp(dbscan_scaled)

cat("PCA Summary:\n")
## PCA Summary:
summary(dbscan_pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.7755 1.1170 0.8368 0.60670 0.57099 0.45335
## Proportion of Variance 0.5254 0.2079 0.1167 0.06135 0.05434 0.03425
## Cumulative Proportion  0.5254 0.7333 0.8501 0.91141 0.96575 1.00000
# Keep the leading principal components. The first 4 explain about 91% of the
# variance (see the cumulative proportion above). The count is stored once so
# the selection and the reported variance cannot drift apart.
dbscan_n_pc <- 4
# drop = FALSE keeps a matrix even if a single component is selected.
dbscan_pca_data <- dbscan_pca$x[, seq_len(dbscan_n_pc), drop = FALSE]

cat("Jumlah PC digunakan:", ncol(dbscan_pca_data), "\n")
## Jumlah PC digunakan: 4
# Row 2 of the importance table holds the proportion of variance per component.
cat("Total variance explained (%):",
    round(sum(summary(dbscan_pca)$importance[2, seq_len(dbscan_n_pc)]) * 100, 2), "\n")
## Total variance explained (%): 91.14

7.3 Penentuan Parameter DBSCAN

7.3.1 Tentukan Parameter minPts

# minPts heuristic used here: number of input dimensions plus two.
dbscan_n_dims <- ncol(dbscan_pca_data)
dbscan_minPts <- dbscan_n_dims + 2
cat("minPts optimal:", dbscan_minPts, "\n")
## minPts optimal: 6
cat("Formula: dimensi data + 2 =", dbscan_n_dims, "+ 2\n")
## Formula: dimensi data + 2 = 4 + 2

7.3.2 Menentukan eps (k-NN Distance Plot)

# Sorted k-nearest-neighbour distances; the elbow suggests a value for eps.
kNNdistplot(dbscan_pca_data, k = dbscan_minPts)
title(main = paste("k-NN Distance Plot (k =", dbscan_minPts, ")"),
      sub = "Elbow point menunjukkan eps optimal")
# Dashed red reference line at the candidate eps
abline(h = 1, col = "red", lty = 2, lwd = 2)

7.3.3 Jalankan DBSCAN

dbscan_eps <- 1 # adjust based on the k-NN distance plot above

# Both attached packages (`dbscan` and `fpc`) export a function named
# dbscan(). `fpc` is attached last, so the bare name resolves to fpc's
# version, and `MinPts` is that version's argument spelling. The namespace is
# spelled out so the call no longer depends on the order of library() calls.
dbscan_model <- fpc::dbscan(
  dbscan_pca_data,
  eps = dbscan_eps,
  MinPts = dbscan_minPts
)

cat("=== HASIL DBSCAN ===\n")
## === HASIL DBSCAN ===
cat("eps:", dbscan_eps, "\n")
## eps: 1
cat("minPts:", dbscan_minPts, "\n\n")
## minPts: 6
cat("Distribusi cluster DBSCAN:\n")
## Distribusi cluster DBSCAN:
print(table(dbscan_model$cluster))
## 
##   0   1   2   3 
##  23 152 495 110
cat("\nCluster 0 = Noise (outlier)\n")
## 
## Cluster 0 = Noise (outlier)

7.4 Validasi Cluster DBSCAN

7.4.1 Analisis Noise

# Noise points carry the label 0.
dbscan_noise <- sum(dbscan_model$cluster == 0)
dbscan_noise_pct <- 100 * dbscan_noise / nrow(dbscan_pca_data)
# Count the distinct labels other than 0. Subtracting 1 unconditionally
# under-counts the clusters whenever the model produced no noise points.
dbscan_n_clusters <- length(setdiff(unique(dbscan_model$cluster), 0))

cat(sprintf("Noise: %d data (%.2f%%)\n", dbscan_noise, dbscan_noise_pct))
## Noise: 23 data (2.95%)
cat(sprintf("Jumlah cluster (tanpa noise): %d\n", dbscan_n_clusters))
## Jumlah cluster (tanpa noise): 3
cat(sprintf("Data valid (non-noise): %d (%.2f%%)\n", 
            nrow(dbscan_pca_data) - dbscan_noise,
            100 - dbscan_noise_pct))
## Data valid (non-noise): 757 (97.05%)

7.4.2 Silhouette Score (exclude noise)

# Positions of the observations that belong to a real cluster (label != 0)
dbscan_idx <- which(dbscan_model$cluster != 0)

# A silhouette needs at least two clusters among the non-noise points.
if (length(unique(dbscan_model$cluster[dbscan_idx])) < 2) {
  cat("Silhouette tidak dapat dihitung (cluster < 2)\n")
} else {

  # Silhouette widths on the non-noise points only
  dbscan_sil <- silhouette(
    dbscan_model$cluster[dbscan_idx],
    dist(dbscan_pca_data[dbscan_idx, ])
  )

  cat(sprintf("Average Silhouette Score (DBSCAN): %.3f\n",
              mean(dbscan_sil[, 3])))

  fviz_silhouette(dbscan_sil) +
    labs(title = "Silhouette Plot - DBSCAN",
         subtitle = "Exclude Noise Points") +
    theme_minimal()

}
## Average Silhouette Score (DBSCAN): 0.251
##   cluster size ave.sil.width
## 1       1  152          0.45
## 2       2  495          0.18
## 3       3  110          0.30

7.5 Visualisasi Hasil DBSCAN

# Scatter of the DBSCAN labels on the first two principal components.
# NOTE(review): `colors_cluster` is not defined in the visible part of this
# file - presumably a palette created in a setup step; confirm it exists.
# "gray40" is prepended so the first label (0 = noise) is drawn in gray.
fviz_cluster(
  list(
    data = dbscan_pca_data[, 1:2],
    cluster = dbscan_model$cluster
  ),
  geom = "point",
  ellipse = TRUE,
  ellipse.type = "convex",
  palette = c("gray40", colors_cluster),
  main = "",
  ggtheme = theme_minimal()
) +
  labs(
    title = "DBSCAN Clustering Result (PCA Space)",
    subtitle = paste(
      "eps =", dbscan_eps,
      "| minPts =", dbscan_minPts,
      "| Noise =", dbscan_noise, sprintf("(%.1f%%)", dbscan_noise_pct)
    ),
    x = "PC1",
    y = "PC2"
  ) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40")
  )

7.6 Export dan Analisis Hasil

7.6.1 Simpan Hasil Cluster DBSCAN

# Attach the DBSCAN label (0 = noise) to the full records and save them.
data[["DBSCAN_Cluster"]] <- dbscan_model$cluster

write.csv(data, file = "laptop_windows_dbscan.csv", row.names = FALSE)

cat("File DBSCAN tersimpan: laptop_windows_dbscan.csv\n")
## File DBSCAN tersimpan: laptop_windows_dbscan.csv
cat("Kolom DBSCAN_Cluster ditambahkan ke dataset\n")
## Kolom DBSCAN_Cluster ditambahkan ke dataset

7.6.2 Karakteristik Cluster DBSCAN

# Total number of laptops assigned to a real cluster (label 0 = noise).
# This must be computed outside the grouped summarise(): inside a group,
# sum(DBSCAN_Cluster != 0) equals the group's own size, which made every
# cluster report a share of 100%.
dbscan_n_valid <- sum(data$DBSCAN_Cluster != 0)

# Size, share of the non-noise data, and mean feature values per cluster.
dbscan_summary <- data %>%
  filter(DBSCAN_Cluster != 0) %>%
  group_by(DBSCAN_Cluster) %>%
  summarise(
    n = n(),
    pct = n() / dbscan_n_valid * 100,
    avg_price = mean(Price),
    avg_ram = mean(Ram),
    avg_ssd = mean(SSD),
    avg_hdd = mean(HDD),
    avg_weight = mean(Weight),
    avg_ppi = mean(Ppi),
    pct_touchscreen = mean(TouchScreen) * 100
  )

cat("=== KARAKTERISTIK CLUSTER DBSCAN ===\n\n")
## === KARAKTERISTIK CLUSTER DBSCAN ===
print(round(dbscan_summary, 2))
## # A tibble: 3 × 10
##   DBSCAN_Cluster     n   pct avg_price avg_ram avg_ssd avg_hdd avg_weight
##            <dbl> <dbl> <dbl>     <dbl>   <dbl>   <dbl>   <dbl>      <dbl>
## 1              1   152  20.1      10.3    4       2.53  711.         2.11
## 2              2   495  65.4      11.0    7.94  203.    340.         1.95
## 3              3   110  14.5      10.5    4.22  188       0.29       1.83
## # ℹ 2 more variables: avg_ppi <dbl>, pct_touchscreen <dbl>

7.6.3 Perbandingan dengan K-Means

# Compare cluster counts and coverage of the two methods
n_total <- nrow(data)
cat("\n=== PERBANDINGAN K-MEANS vs DBSCAN ===\n\n")
## 
## === PERBANDINGAN K-MEANS vs DBSCAN ===
cat("K-Means:\n")
## K-Means:
cat("  - Jumlah cluster:", 3, "\n")
##   - Jumlah cluster: 3
cat("  - Total data:", n_total, "\n")
##   - Total data: 780
cat("  - Semua data masuk cluster\n\n")
##   - Semua data masuk cluster
cat("DBSCAN:\n")
## DBSCAN:
cat("  - Jumlah cluster:", dbscan_n_clusters, "\n")
##   - Jumlah cluster: 3
cat("  - Total data:", n_total, "\n")
##   - Total data: 780
cat("  - Noise/outlier:", dbscan_noise, sprintf("(%.2f%%)\n", dbscan_noise_pct))
##   - Noise/outlier: 23 (2.95%)
cat("  - Data valid:", n_total - dbscan_noise, "\n")
##   - Data valid: 757

7.7 Kesimpulan DBSCAN

# Text summary of the DBSCAN analysis
cat("=== KESIMPULAN DBSCAN ===\n\n")
## === KESIMPULAN DBSCAN ===
cat("Dataset: Laptop Windows (sama dengan K-Means)\n")
## Dataset: Laptop Windows (sama dengan K-Means)
n_processed <- nrow(data)
cat(sprintf("Data diproses: %d laptop\n", n_processed))
## Data diproses: 780 laptop
dbscan_feature_list <- paste(dbscan_features, collapse = ", ")
cat(sprintf("Fitur: %s\n", dbscan_feature_list))
## Fitur: Ram, SSD, HDD, Weight, Ppi, Price
cat(sprintf("Parameter: eps = %s, minPts = %d\n", dbscan_eps, dbscan_minPts))
## Parameter: eps = 1, minPts = 6
cat(sprintf("Jumlah cluster: %d\n", dbscan_n_clusters))
## Jumlah cluster: 3
cat(sprintf("Noise: %d data (%.2f%%)\n", dbscan_noise, dbscan_noise_pct))
## Noise: 23 data (2.95%)
# The silhouette object only exists when it could be computed earlier.
if (exists("dbscan_sil")) {
  cat(sprintf("Silhouette Score: %.3f\n", mean(dbscan_sil[, 3])))
}
## Silhouette Score: 0.251
cat("\nKelebihan DBSCAN dibanding K-Means:\n")
## 
## Kelebihan DBSCAN dibanding K-Means:
cat("  - Tidak perlu tentukan jumlah cluster\n")
##   - Tidak perlu tentukan jumlah cluster
cat("  - Dapat deteksi outlier/noise\n")
##   - Dapat deteksi outlier/noise
cat("  - Dapat menemukan cluster bentuk arbitrary\n")
##   - Dapat menemukan cluster bentuk arbitrary