1. Tujuan dan Alasan Klasterisasi

1.1 Import Library

library(cluster)
library(factoextra)
library(ggplot2)
library(reshape2)
library(corrplot)
library(dplyr)

1.2 Load Dataset

# Read the raw laptop table from the working directory
data <- read.csv(file = "laptop.csv")

# Report the size of the freshly loaded table (rows first, then columns)
cat("Dimensi data awal:", dim(data)[1], "baris,", dim(data)[2], "kolom\n")

## Dimensi data awal: 1273 baris, 13 kolom

1.3 Tentukan Fokus Analisis

# Operating-system distribution before any filtering is applied
cat("Distribusi OS:\n")

## Distribusi OS:

# Frequency count of every distinct value in the Os column
print(table(data[["Os"]]))

## 
##     Mac  Others Windows 
##      21     152    1100

Filter dataset hanya untuk laptop dengan OS = Windows.

# Keep only Windows laptops.
# `%in%` is used instead of `==` because `==` returns NA for a missing Os value,
# and a logical NA index produces an all-NA row instead of dropping the record.
data <- data[data$Os %in% "Windows", ]
cat("\nData setelah filter Windows:", nrow(data), "laptop\n")

## 
## Data setelah filter Windows: 1100 laptop

2. Persiapan Data

2.1 Filter Dataset

# Discard invalid records: laptops reporting neither SSD nor HDD capacity
invalid_idx <- which(data$SSD == 0 & data$HDD == 0)
cat("Data invalid (SSD=0 & HDD=0):", length(invalid_idx), "\n")

## Data invalid (SSD=0 & HDD=0): 46

# Select the complementary row positions; when nothing is invalid this keeps
# every row, so no explicit emptiness guard is required
data <- data[setdiff(seq_len(nrow(data)), invalid_idx), ]
cat("Data setelah buang invalid:", nrow(data), "laptop\n")

## Data setelah buang invalid: 1054 laptop

2.2 Buang Kolom yang Tidak Diperlukan

# Numeric features considered relevant for clustering
features <- c("Ram", "SSD", "HDD", "Weight", "Ppi", "Price")
# List-style column selection keeps the result a data frame
data_cluster <- data[features]

cat("Fitur yang digunakan:", toString(features), "\n")

## Fitur yang digunakan: Ram, SSD, HDD, Weight, Ppi, Price

cat("Dimensi data clustering:", dim(data_cluster)[1], "x", dim(data_cluster)[2], "\n")

## Dimensi data clustering: 1054 x 6

2.3 Periksa Tipe Data

# Inspect the storage type of every column
cat("Tipe data per kolom:\n")

## Tipe data per kolom:

str(data_cluster)

## 'data.frame':    1054 obs. of  6 variables:
##  $ Ram   : int  4 16 8 4 8 8 8 8 8 4 ...
##  $ SSD   : int  0 512 256 256 256 128 128 256 256 0 ...
##  $ HDD   : int  500 0 0 0 0 0 1000 0 0 1000 ...
##  $ Weight: num  2.1 1.3 1.6 2.2 2.2 1.22 2.5 1.62 1.91 2.3 ...
##  $ Ppi   : num  100 157 157 141 141 ...
##  $ Price : num  9.97 11.29 10.62 10.19 10.59 ...

# Confirm that every column is numeric (one logical flag per column)
cat("\nSemua kolom numerik:", all(vapply(data_cluster, is.numeric, logical(1))), "\n")

## 
## Semua kolom numerik: TRUE

2.4 Pengecekan Nilai Null

cat("Jumlah missing value per kolom:\n")

## Jumlah missing value per kolom:

# Flag missing cells, then count the flags column by column
data_cluster %>%
  is.na() %>%
  colSums() %>%
  print()

##    Ram    SSD    HDD Weight    Ppi  Price 
##      0      0      0      0      0      0

3. Analisis Deskriptif

3.1 Ringkasan Dataset

# Per-column descriptive statistics of the clustering features
summary(data_cluster)

##       Ram              SSD              HDD             Weight     
##  Min.   : 2.000   Min.   :   0.0   Min.   :   0.0   Min.   :0.810  
##  1st Qu.: 6.000   1st Qu.:   0.0   1st Qu.:   0.0   1st Qu.:1.565  
##  Median : 8.000   Median : 256.0   Median :   0.0   Median :2.040  
##  Mean   : 8.953   Mean   : 210.5   Mean   : 422.8   Mean   :2.077  
##  3rd Qu.: 8.000   3rd Qu.: 256.0   3rd Qu.:1000.0   3rd Qu.:2.360  
##  Max.   :64.000   Max.   :1024.0   Max.   :2000.0   Max.   :4.700  
##       Ppi             Price       
##  Min.   : 90.58   Min.   : 9.409  
##  1st Qu.:127.34   1st Qu.:10.550  
##  Median :141.21   Median :10.977  
##  Mean   :148.55   Mean   :10.944  
##  3rd Qu.:157.35   3rd Qu.:11.353  
##  Max.   :352.47   Max.   :12.691

3.2 Visualisasi Awal - Boxplot

# Reshape the features to long format; the plots below rely on the resulting
# `variable` (feature name) and `value` columns
boxplot_data <- melt(data_cluster)

# One boxplot per feature, each facet with its own free axes, to spot outliers
# before the IQR cleaning step; outlier points are highlighted in red
ggplot(boxplot_data, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(alpha = 0.8, outlier.color = "#E74C3C", outlier.size = 2) +
  scale_fill_manual(values = c("#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#3B1F2B", "#1B998B")) +
  labs(x = NULL, y = "Value", 
       title = "Boxplot - Deteksi Outlier",
       subtitle = "Sebelum IQR Cleaning") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal() +
  # Legend and x tick labels are hidden; the facet strip already names the feature
  theme(legend.position = "none",
        axis.text.x = element_blank(),
        strip.text = element_text(face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

3.3 Visualisasi Awal - Histogram

# Histogram (25 bins) of every feature, reusing the long-format `boxplot_data`;
# each facet has free axes so differently scaled features stay readable
ggplot(boxplot_data, aes(x = value, fill = variable)) +
  geom_histogram(bins = 25, color = "white", alpha = 0.85) +
  scale_fill_manual(values = c("#2E86AB", "#A23B72", "#F18F01", "#C73E1D", "#3B1F2B", "#1B998B")) +
  labs(x = "Value", y = "Frequency", 
       title = "Histogram - Distribusi Fitur",
       subtitle = "Melihat sebaran data tiap variabel") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal() +
  theme(legend.position = "none",
        strip.text = element_text(face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

3.4 Data Cleaning - IQR Method

cat("Data sebelum IQR cleaning:", nrow(data_cluster), "\n\n")

## Data sebelum IQR cleaning: 1054

# Per-column quartiles and interquartile range.
# vapply() walks the data-frame columns directly (apply() would first coerce
# the data frame to a matrix) and guarantees one numeric value per column.
Q1 <- vapply(data_cluster, quantile, numeric(1), probs = 0.25, names = FALSE)
Q3 <- vapply(data_cluster, quantile, numeric(1), probs = 0.75, names = FALSE)
IQR_val <- Q3 - Q1

# Upper and lower fences for every column (1.5 * IQR beyond the quartiles)
upper <- Q3 + 1.5 * IQR_val
lower <- Q1 - 1.5 * IQR_val

# Show the fences
cat("Batas IQR:\n")

## Batas IQR:

print(data.frame(Q1 = round(Q1, 2), Q3 = round(Q3, 2), IQR = round(IQR_val, 2),
                 Lower = round(lower, 2), Upper = round(upper, 2)))

##            Q1      Q3     IQR    Lower   Upper
## Ram      6.00    8.00    2.00     3.00   11.00
## SSD      0.00  256.00  256.00  -384.00  640.00
## HDD      0.00 1000.00 1000.00 -1500.00 2500.00
## Weight   1.56    2.36    0.79     0.37    3.55
## Ppi    127.34  157.35   30.01    82.31  202.37
## Price   10.55   11.35    0.80     9.35   12.56

# Keep the rows that lie inside the fences for every column.
# The fences were computed once on the full data, so a single combined mask is
# equivalent to filtering column after column, without re-subsetting in a loop
# and without the `1:ncol()` pitfall on a zero-column input.
within_fences <- Map(
  function(values, lo, hi) values >= lo & values <= hi,
  data_cluster, lower, upper
)
keep_row <- Reduce(`&`, within_fences, rep(TRUE, nrow(data_cluster)))
cleaned_cluster <- data_cluster[keep_row, , drop = FALSE]

cat("\nData setelah IQR cleaning:", nrow(cleaned_cluster), "\n")

## 
## Data setelah IQR cleaning: 780

cat("Outlier dihapus:", nrow(data_cluster) - nrow(cleaned_cluster), "\n")

## Outlier dihapus: 274

# Update both tables: `data` keeps the surviving rows, matched by row name,
# so it stays aligned with the cleaned clustering features
data <- data[rownames(cleaned_cluster), ]
data_cluster <- cleaned_cluster

3.5 Boxplot Setelah Cleaning

# Long-format copy of the cleaned features (columns `variable` and `value`)
boxplot_clean <- melt(data_cluster)

# Same per-feature boxplot as before the cleaning step, drawn in a green
# palette, to compare the spread after outlier removal
ggplot(boxplot_clean, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(values = c("#27AE60", "#2ECC71", "#1ABC9C", "#16A085", "#138D75", "#0E6655")) +
  labs(x = NULL, y = "Value", 
       title = "Boxplot - Setelah IQR Cleaning",
       subtitle = "Data lebih bersih dari outlier") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_blank(),
        strip.text = element_text(face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

4. Normalisasi Data

4.1 Standardisasi (Z-Score)

# Standardise every feature (centre on the column mean, divide by the column
# standard deviation); the defaults are spelled out for clarity
data_scaled <- scale(data_cluster, center = TRUE, scale = TRUE)

# Verify the scaling
cat("Mean setelah scaling (harus ~0):\n")

## Mean setelah scaling (harus ~0):

data_scaled %>%
  colMeans() %>%
  round(10) %>%
  print()

##    Ram    SSD    HDD Weight    Ppi  Price 
##      0      0      0      0      0      0

cat("\nStd Dev setelah scaling (harus ~1):\n")

## 
## Std Dev setelah scaling (harus ~1):

data_scaled %>%
  apply(2, sd) %>%
  round(10) %>%
  print()

##    Ram    SSD    HDD Weight    Ppi  Price 
##      1      1      1      1      1      1

4.2 Cek Distribusi Setelah Normalisasi

# scale() returned a matrix; convert it back to a data frame and reshape to
# long format (`variable`, `value`) for faceted plotting
scaled_df <- as.data.frame(data_scaled)
scaled_melt <- melt(scaled_df)

# Histogram of the standardised values of every feature
ggplot(scaled_melt, aes(x = value, fill = variable)) +
  geom_histogram(bins = 25, color = "white", alpha = 0.85) +
  scale_fill_manual(values = c("#9B59B6", "#8E44AD", "#7D3C98", "#6C3483", "#5B2C6F", "#4A235A")) +
  labs(x = "Scaled Value (Z-Score)", y = "Frequency", 
       title = "Distribusi Setelah Normalisasi",
       subtitle = "Semua fitur dalam skala yang sama") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal() +
  theme(legend.position = "none",
        strip.text = element_text(face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

5. Pemilihan Fitur

5.1 Analisis Korelasi Antar Fitur

# Pairwise correlation between the (unscaled, cleaned) clustering features
cor_matrix <- cor(data_cluster)
cat("Correlation Matrix:\n")

## Correlation Matrix:

print(round(cor_matrix, 2))

##          Ram   SSD   HDD Weight   Ppi Price
## Ram     1.00  0.46 -0.10  -0.04  0.39  0.60
## SSD     0.46  1.00 -0.69  -0.38  0.53  0.58
## HDD    -0.10 -0.69  1.00   0.51 -0.40 -0.30
## Weight -0.04 -0.38  0.51   1.00 -0.55 -0.23
## Ppi     0.39  0.53 -0.40  -0.55  1.00  0.54
## Price   0.60  0.58 -0.30  -0.23  0.54  1.00

# Correlation heatmap (upper triangle only) with the coefficients printed in
# each cell, using a red-white-blue colour ramp of 100 steps
corrplot(cor_matrix, 
         method = "color", 
         type = "upper", 
         addCoef.col = "black",
         number.cex = 0.9,
         tl.col = "black", 
         tl.srt = 45,
         tl.cex = 1.1,
         col = colorRampPalette(c("#E74C3C", "#FFFFFF", "#3498DB"))(100),
         title = "Korelasi Antar Fitur", 
         mar = c(0,0,2,0))

5.2 PCA (Principal Component Analysis)

# PCA on the already standardised feature matrix
pca_result <- prcomp(data_scaled)

# Variance explained by each principal component
pca_var <- summary(pca_result)
print(pca_var)

## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.7755 1.1170 0.8368 0.60670 0.57099 0.45335
## Proportion of Variance 0.5254 0.2079 0.1167 0.06135 0.05434 0.03425
## Cumulative Proportion  0.5254 0.7333 0.8501 0.91141 0.96575 1.00000

# Scree plot of the explained variance, with value labels on the bars
fviz_eig(pca_result, 
         addlabels = TRUE,
         barfill = "#3498DB",
         barcolor = "#2980B9",
         linecolor = "#E74C3C") +
  labs(title = "PCA - Variance Explained",
       subtitle = "Kontribusi setiap Principal Component",
       x = "Principal Component",
       y = "Percentage of Variance Explained") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

# Variable plot for the first two components, coloured by each feature's
# contribution
fviz_pca_var(pca_result, 
             col.var = "contrib",
             gradient.cols = c("#00B894", "#FDCB6E", "#E17055"),
             repel = TRUE) +
  labs(title = "PCA - Kontribusi Fitur",
       subtitle = "Hubungan fitur dengan Dim1 & Dim2") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

6. K-Means Clustering

6.1 Visualisasi Jarak Antar Data

# Pairwise distance object between all standardised observations; it is reused
# later for the silhouette computations.
# NOTE(review): no method is passed, so get_dist() uses its default metric; the
# plot title labels it Euclidean - confirm against the factoextra docs.
jarak <- get_dist(data_scaled)

# Heatmap of the distance matrix with a three-colour gradient
fviz_dist(jarak, 
          gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")) +
  labs(title = "Matriks Jarak Euclidean",
       subtitle = "Semakin gelap = semakin jauh jaraknya") +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

6.2 Eksperimen K-Means dengan Berbagai K

6.2.1 K-Means dengan K=2

# Fix the RNG seed so the 25 random starts give a reproducible solution
set.seed(123)
k2 <- kmeans(x = data_scaled, centers = 2, nstart = 25)

# Total within-cluster sum of squares for the two-cluster solution
cat("K=2 | Total within SS:", round(k2[["tot.withinss"]], 2), "\n")

## K=2 | Total within SS: 2851.79

# Between-cluster sum of squares as a percentage of the total
cat("K=2 | Between/Total SS:", round(k2[["betweenss"]] / k2[["totss"]] * 100, 2), "%\n")

## K=2 | Between/Total SS: 38.99 %

6.2.2 K-Means dengan K=3

# Fix the RNG seed so the 25 random starts give a reproducible solution
set.seed(123)
k3 <- kmeans(x = data_scaled, centers = 3, nstart = 25)

# Total within-cluster sum of squares for the three-cluster solution
cat("K=3 | Total within SS:", round(k3[["tot.withinss"]], 2), "\n")

## K=3 | Total within SS: 2165.96

# Between-cluster sum of squares as a percentage of the total
cat("K=3 | Between/Total SS:", round(k3[["betweenss"]] / k3[["totss"]] * 100, 2), "%\n")

## K=3 | Between/Total SS: 53.66 %

6.2.3 K-Means dengan K=4

# Fix the RNG seed so the 25 random starts give a reproducible solution
set.seed(123)
k4 <- kmeans(x = data_scaled, centers = 4, nstart = 25)

# Total within-cluster sum of squares for the four-cluster solution
cat("K=4 | Total within SS:", round(k4[["tot.withinss"]], 2), "\n")

## K=4 | Total within SS: 1826.11

# Between-cluster sum of squares as a percentage of the total
cat("K=4 | Between/Total SS:", round(k4[["betweenss"]] / k4[["totss"]] * 100, 2), "%\n")

## K=4 | Between/Total SS: 60.93 %

6.3 Validasi dan Pemilihan K Optimal

6.3.1 Elbow Method

# fviz_nbclust() runs k-means with random starts for each candidate K. Seed the
# RNG first so the WSS curve, and the hard-coded "elbow at K=3" annotation
# that relies on it, is reproducible between renders.
set.seed(123)

# Elbow plot: total within-cluster sum of squares versus number of clusters,
# with a dashed red marker at K = 3
fviz_nbclust(data_scaled, kmeans, method = "wss", nstart = 25) +
  geom_vline(xintercept = 3, linetype = 2, color = "#E74C3C", linewidth = 1) +
  labs(title = "Elbow Method - Menentukan K Optimal",
       subtitle = "Elbow (siku) berada di K=3",
       x = "Jumlah Cluster (K)",
       y = "Total Within Sum of Squares (WSS)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

6.3.2 Silhouette Score

# fviz_nbclust() runs k-means with random starts for each candidate K. Seed the
# RNG first so the silhouette curve, and the hard-coded "highest at K=3"
# subtitle that relies on it, is reproducible between renders.
set.seed(123)

# Average silhouette width versus number of clusters
fviz_nbclust(data_scaled, kmeans, method = "silhouette", nstart = 25) +
  labs(title = "Silhouette Method - Menentukan K Optimal",
       subtitle = "Silhouette tertinggi di K=3",
       x = "Jumlah Cluster (K)",
       y = "Average Silhouette Width") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

6.3.3 Perbandingan Hasil K=2, K=3, K=4

# Side-by-side quality metrics of the three fitted k-means models; each metric
# is pulled from the models with a typed vapply() instead of being listed by hand
comparison <- data.frame(
  K = c(2, 3, 4),
  Within_SS = vapply(
    list(k2, k3, k4),
    function(fit) fit$tot.withinss,
    numeric(1)
  ),
  Between_Total = vapply(
    list(k2, k3, k4),
    function(fit) fit$betweenss / fit$totss,
    numeric(1)
  )
)

cat("=== PERBANDINGAN K ===\n")

## === PERBANDINGAN K ===

print(comparison)

##   K Within_SS Between_Total
## 1 2  2851.793     0.3898603
## 2 3  2165.956     0.5365949
## 3 4  1826.111     0.6093045

cat("\nKesimpulan: K=3 dipilih berdasarkan Elbow Method dan Silhouette Score\n")

## 
## Kesimpulan: K=3 dipilih berdasarkan Elbow Method dan Silhouette Score

6.3.4 Silhouette Plot untuk K=3

# Silhouette information of the K=3 assignment, based on the precomputed
# distance object `jarak`
sil_k3 <- silhouette(k3$cluster, jarak)

# Silhouette plot; the subtitle reports the mean of the third column of
# `sil_k3`, used here as the average silhouette width
fviz_silhouette(sil_k3) +
  labs(title = "Silhouette Plot - K=3",
       subtitle = paste("Average Silhouette Width:", round(mean(sil_k3[, 3]), 3))) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"))

##   cluster size ave.sil.width
## 1       1  241          0.28
## 2       2  356          0.42
## 3       3  183          0.35

6.4 Analisis Hasil Cluster Terpilih (K=3)

6.4.1 Visualisasi Cluster

# Attach the K=3 labels to the full data table; this relies on `data` and
# `data_scaled` holding the same rows in the same order
data$Cluster <- k3$cluster

# Scatter plot of the clusters with convex hulls; the axis labels below name
# the two plotted dimensions as principal components 1 and 2.
# NOTE(review): `colors_cluster` is not defined in the visible code - presumably
# it is created in a hidden setup chunk; confirm it holds at least 3 colours.
fviz_cluster(k3, data = data_scaled, 
             geom = "point",
             ellipse.type = "convex",
             palette = colors_cluster,
             ggtheme = theme_minimal(),
             main = "") +
  labs(title = "K-Means Clustering (K=3) - PCA Space",
       subtitle = paste("Between_SS / Total_SS =", 
                        round(k3$betweenss / k3$totss * 100, 1), "%"),
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme(plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
        legend.position = "bottom")

6.4.2 Karakteristik Cluster

# Cluster profile: member count plus the mean of every clustering feature
# (original, unscaled units), one row per cluster
cluster_summary <- data %>%
  group_by(Cluster) %>%
  summarise(
    n = n(),
    across(c(Ram, SSD, HDD, Weight, Ppi, Price), mean)
  )

cat("=== KARAKTERISTIK CLUSTER ===\n")

## === KARAKTERISTIK CLUSTER ===

print(round(cluster_summary, 2))

## # A tibble: 3 × 8
##   Cluster     n   Ram   SSD     HDD Weight   Ppi Price
##     <dbl> <dbl> <dbl> <dbl>   <dbl>  <dbl> <dbl> <dbl>
## 1       1   241  4.18  58.3  473.     2.05  120.  10.3
## 2       2   356  7.61 270.     4.21   1.7   153.  11.0
## 3       3   183  7.81  66.4 1020.     2.38  131.  10.8

6.4.3 Visualisasi Perbandingan Karakteristik

# Drop the second column (`n`, the member count) and reshape the feature means
# to long format, keeping `Cluster` as the identifier
cluster_melt <- melt(cluster_summary[, -2], id.vars = "Cluster")
# Treat the cluster label as a category so it maps to discrete fill colours
cluster_melt$Cluster <- as.factor(cluster_melt$Cluster)

# Grouped bars of the mean feature values, one facet per feature with free axes
ggplot(cluster_melt, aes(x = variable, y = value, fill = Cluster)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.85) +
  scale_fill_manual(values = c("#00B894", "#0984E3", "#E17055")) +
  labs(x = NULL, y = "Rata-rata Nilai",
       title = "Perbandingan Karakteristik Antar Cluster",
       subtitle = "Rata-rata nilai fitur per cluster") +
  facet_wrap(~variable, scales = "free", ncol = 3) +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        strip.text = element_text(face = "bold", size = 11),
        plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
        legend.position = "bottom")

6.4.4 Interpretasi Bisnis

# Number of laptops per cluster (kept as a table; the brand and type sections
# further down also read `cluster_counts`)
cluster_counts <- table(data$Cluster)

# Rank the clusters on each key feature (1 = lowest mean value)
rank_price <- rank(cluster_summary$Price)
rank_ram <- rank(cluster_summary$Ram)
rank_ssd <- rank(cluster_summary$SSD)
rank_ppi <- rank(cluster_summary$Ppi)

# Average rank across the four features (higher = more high-end)
avg_rank <- (rank_price + rank_ram + rank_ssd + rank_ppi) / 4

cat("=== SCORING SEGMENT ===\n")

## === SCORING SEGMENT ===

# One vectorised sprintf() prints a line per cluster, so the report no longer
# needs one copy-pasted statement per cluster id
cat(sprintf("Cluster %d: Avg Rank = %.2f\n", cluster_summary$Cluster, avg_rank),
    sep = "")

## Cluster 1: Avg Rank = 1.00
## Cluster 2: Avg Rank = 2.75
## Cluster 3: Avg Rank = 2.25

# Iterate over the rows of the summary instead of a hard-coded 1:3, so the
# report follows whatever number of clusters `cluster_summary` contains
for (i in seq_len(nrow(cluster_summary))) {
  cl <- cluster_summary[i, ]
  # Look the size up by cluster label rather than by position
  n_in_cluster <- cluster_counts[[as.character(cl$Cluster)]]

  cat(sprintf("\n=== CLUSTER %d (%d laptops, %.1f%%) ===\n",
              cl$Cluster, n_in_cluster,
              100 * n_in_cluster / sum(cluster_counts)))

  cat(sprintf("Ram: %.1f GB | SSD: %.0f GB | HDD: %.0f GB\n", cl$Ram, cl$SSD, cl$HDD))
  cat(sprintf("Weight: %.2f kg | Ppi: %.0f | Price: %.2f\n", cl$Weight, cl$Ppi, cl$Price))

  # Segment label from the average rank: highest -> high-end,
  # lowest -> budget, everything in between -> mid-range
  if (avg_rank[i] == max(avg_rank)) {
    cat(">> SEGMENT: HIGH-END\n")
  } else if (avg_rank[i] == min(avg_rank)) {
    cat(">> SEGMENT: BUDGET/ENTRY-LEVEL\n")
  } else {
    cat(">> SEGMENT: MID-RANGE\n")
  }
}

## 
## === CLUSTER 1 (241 laptops, 30.9%) ===
## Ram: 4.2 GB | SSD: 58 GB | HDD: 473 GB
## Weight: 2.05 kg | Ppi: 120 | Price: 10.30
## >> SEGMENT: BUDGET/ENTRY-LEVEL
## 
## === CLUSTER 2 (356 laptops, 45.6%) ===
## Ram: 7.6 GB | SSD: 270 GB | HDD: 4 GB
## Weight: 1.70 kg | Ppi: 153 | Price: 11.05
## >> SEGMENT: HIGH-END
## 
## === CLUSTER 3 (183 laptops, 23.5%) ===
## Ram: 7.8 GB | SSD: 66 GB | HDD: 1020 GB
## Weight: 2.38 kg | Ppi: 131 | Price: 10.78
## >> SEGMENT: MID-RANGE

6.4.5 Distribusi Brand per Cluster

# Top brands per cluster.
# - Iterates over the cluster labels actually present instead of a fixed 1:3.
# - head(..., 5) returns at most five entries; the previous `[1:5]` padded the
#   result with NA entries whenever a cluster had fewer than five brands.
# - The inner index loop is replaced by one vectorised sprintf(), which also
#   prints nothing (instead of failing) when a cluster has no brand at all.
for (i in sort(unique(data$Cluster))) {
  in_cluster <- data$Cluster == i
  cat(sprintf("\nCluster %d - Top 5 Brand:\n", i))

  brands <- head(sort(table(data[in_cluster, "Company"]), decreasing = TRUE), 5)
  brand_n <- as.vector(brands)

  # Percentages are relative to the size of the current cluster
  cat(sprintf("  %s: %d (%.1f%%)\n", names(brands), brand_n,
              100 * brand_n / sum(in_cluster)),
      sep = "")
}

## 
## Cluster 1 - Top 5 Brand:
##   HP: 65 (27.0%)
##   Dell: 50 (20.7%)
##   Lenovo: 50 (20.7%)
##   Acer: 37 (15.4%)
##   Asus: 22 (9.1%)
## 
## Cluster 2 - Top 5 Brand:
##   HP: 107 (30.1%)
##   Lenovo: 86 (24.2%)
##   Dell: 78 (21.9%)
##   Toshiba: 25 (7.0%)
##   Asus: 24 (6.7%)
## 
## Cluster 3 - Top 5 Brand:
##   HP: 46 (25.1%)
##   Dell: 45 (24.6%)
##   Lenovo: 32 (17.5%)
##   Asus: 29 (15.8%)
##   MSI: 17 (9.3%)

6.4.6 Distribusi Type per Cluster

# Laptop type distribution per cluster, most frequent type first.
# - Iterates over the cluster labels actually present instead of a fixed 1:3.
# - One vectorised sprintf() replaces the `1:length(types)` index loop, which
#   would iterate over c(1, 0) if the table were ever empty.
for (i in sort(unique(data$Cluster))) {
  in_cluster <- data$Cluster == i
  cat(sprintf("\nCluster %d - Type Distribution:\n", i))

  types <- sort(table(data[in_cluster, "TypeName"]), decreasing = TRUE)
  type_n <- as.vector(types)

  # Percentages are relative to the size of the current cluster
  cat(sprintf("  %s: %d (%.1f%%)\n", names(types), type_n,
              100 * type_n / sum(in_cluster)),
      sep = "")
}

## 
## Cluster 1 - Type Distribution:
##   Notebook: 220 (91.3%)
##   2 in 1 Convertible: 11 (4.6%)
##   Netbook: 5 (2.1%)
##   Ultrabook: 4 (1.7%)
##   Gaming: 1 (0.4%)
## 
## Cluster 2 - Type Distribution:
##   Notebook: 176 (49.4%)
##   Ultrabook: 103 (28.9%)
##   2 in 1 Convertible: 49 (13.8%)
##   Workstation: 13 (3.7%)
##   Gaming: 11 (3.1%)
##   Netbook: 4 (1.1%)
## 
## Cluster 3 - Type Distribution:
##   Notebook: 113 (61.7%)
##   Gaming: 57 (31.1%)
##   Workstation: 6 (3.3%)
##   2 in 1 Convertible: 5 (2.7%)
##   Ultrabook: 2 (1.1%)

6.5 Export Hasil K-Means

# Write the cleaned data, including the `Cluster` label column, to a CSV file
# in the working directory; row names are not written
write.csv(data, "laptop_clustered_k3.csv", row.names = FALSE)
cat("Data dengan label cluster tersimpan: laptop_clustered_k3.csv\n")

## Data dengan label cluster tersimpan: laptop_clustered_k3.csv

6.6 Kesimpulan K-Means

# Plain-text recap of the k-means analysis
cat("=== KESIMPULAN K-MEANS ===\n\n")

## === KESIMPULAN K-MEANS ===

cat("Dataset: Laptop Windows\n")

## Dataset: Laptop Windows

# The starting count (1100) is a literal; only the post-cleaning count is computed
cat(sprintf("Data awal: 1100 | Setelah cleaning: %d\n", nrow(data)))

## Data awal: 1100 | Setelah cleaning: 780

cat(sprintf("Fitur: %s\n", toString(features)))

## Fitur: Ram, SSD, HDD, Weight, Ppi, Price

cat("K optimal: 3 (Elbow + Silhouette)\n")

## K optimal: 3 (Elbow + Silhouette)

cat(sprintf("Between_SS/Total_SS: %.1f%%\n", 100 * k3$betweenss / k3$totss))

## Between_SS/Total_SS: 53.7%

# Mean silhouette width of the K=3 solution (column selected by name)
cat(sprintf("Silhouette Score: %.3f\n",
            mean(silhouette(k3$cluster, jarak)[, "sil_width"])))

## Silhouette Score: 0.362

cat("\nSegmentasi yang ditemukan:\n")

## 
## Segmentasi yang ditemukan:

cat("  - Budget/Entry-Level\n")

##   - Budget/Entry-Level

cat("  - Mid-Range\n")

##   - Mid-Range

cat("  - High-End\n")

##   - High-End

7. DBSCAN Clustering

7.1 Import Library DBSCAN

library(dbscan)
library(fpc)

7.2 Persiapan Data untuk DBSCAN

# Use the continuous features only (DBSCAN is sensitive to binary variables)
dbscan_features <- c("Ram", "SSD", "HDD", "Weight", "Ppi", "Price")

# List-style column selection keeps the result a data frame
dbscan_data <- data[dbscan_features]

cat("Dimensi data DBSCAN:", dim(dbscan_data)[1], "x", dim(dbscan_data)[2], "\n")

## Dimensi data DBSCAN: 780 x 6

7.2.1 Ringkasan Data DBSCAN

# Per-column descriptive statistics of the DBSCAN input features
summary(dbscan_data)

##       Ram             SSD           HDD             Weight     
##  Min.   :4.000   Min.   :  0   Min.   :   0.0   Min.   :0.810  
##  1st Qu.:4.000   1st Qu.:  0   1st Qu.:   0.0   1st Qu.:1.600  
##  Median :8.000   Median :128   Median :   0.0   Median :2.000  
##  Mean   :6.597   Mean   :157   Mean   : 387.4   Mean   :1.971  
##  3rd Qu.:8.000   3rd Qu.:256   3rd Qu.:1000.0   3rd Qu.:2.208  
##  Max.   :8.000   Max.   :512   Max.   :2000.0   Max.   :3.520  
##       Ppi             Price       
##  Min.   : 90.58   Min.   : 9.409  
##  1st Qu.:125.37   1st Qu.:10.420  
##  Median :141.21   Median :10.779  
##  Mean   :137.33   Mean   :10.753  
##  3rd Qu.:157.35   3rd Qu.:11.077  
##  Max.   :200.84   Max.   :12.257

7.2.2 Normalisasi Data DBSCAN

# Standardise the DBSCAN features; the defaults are spelled out for clarity
dbscan_scaled <- scale(dbscan_data, center = TRUE, scale = TRUE)

# Check the result of the standardisation
cat("Mean setelah scaling:\n")

## Mean setelah scaling:

dbscan_scaled %>%
  colMeans() %>%
  round(5)

##    Ram    SSD    HDD Weight    Ppi  Price 
##      0      0      0      0      0      0

cat("\nStd Dev setelah scaling:\n")

## 
## Std Dev setelah scaling:

dbscan_scaled %>%
  apply(2, sd) %>%
  round(5)

##    Ram    SSD    HDD Weight    Ppi  Price 
##      1      1      1      1      1      1

7.2.3 PCA untuk Reduksi Dimensi

# PCA on the standardised DBSCAN features
dbscan_pca <- prcomp(dbscan_scaled)

cat("PCA Summary:\n")

## PCA Summary:

summary(dbscan_pca)

## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6
## Standard deviation     1.7755 1.1170 0.8368 0.60670 0.57099 0.45335
## Proportion of Variance 0.5254 0.2079 0.1167 0.06135 0.05434 0.03425
## Cumulative Proportion  0.5254 0.7333 0.8501 0.91141 0.96575 1.00000

# Keep the first four components (about 91% of the variance per the summary)
dbscan_pca_data <- dbscan_pca$x[, paste0("PC", 1:4)]

cat("Jumlah PC digunakan:", ncol(dbscan_pca_data), "\n")

## Jumlah PC digunakan: 4

# Sum the per-component variance proportions of the four retained components
cat("Total variance explained (%):",
    round(sum(summary(dbscan_pca)$importance["Proportion of Variance", 1:4]) * 100, 2),
    "\n")

## Total variance explained (%): 91.14

7.3 Penentuan Parameter DBSCAN

7.3.1 Tentukan Parameter minPts

# minPts heuristic used here: number of input dimensions plus two
dbscan_minPts <- 2 + ncol(dbscan_pca_data)
cat("minPts optimal:", dbscan_minPts, "\n")

## minPts optimal: 6

cat("Formula: dimensi data + 2 =", dim(dbscan_pca_data)[2], "+ 2\n")

## Formula: dimensi data + 2 = 4 + 2

7.3.2 Menentukan eps (k-NN Distance Plot)

# k-nearest-neighbour distance plot used to choose eps by eye; the dashed red
# horizontal line marks the candidate value eps = 1.
# NOTE(review): k is set to minPts here, while the dbscan package documentation
# suggests k = minPts - 1 - confirm which convention is intended.
kNNdistplot(dbscan_pca_data, k = dbscan_minPts)
abline(h = 1, col = "red", lty = 2, lwd = 2)
title(main = paste("k-NN Distance Plot (k =", dbscan_minPts, ")"),
      sub = "Elbow point menunjukkan eps optimal")

7.3.3 Jalankan DBSCAN

# Neighbourhood radius; adjust according to the k-NN distance plot above
dbscan_eps <- 1

# Both the `dbscan` and `fpc` packages export a function called dbscan(), and
# the `MinPts` argument spelling belongs to the fpc version. Calling it through
# its namespace makes the choice explicit instead of depending on which
# package happened to be attached last.
dbscan_model <- fpc::dbscan(
  dbscan_pca_data,
  eps = dbscan_eps,
  MinPts = dbscan_minPts
)

cat("=== HASIL DBSCAN ===\n")

## === HASIL DBSCAN ===

cat("eps:", dbscan_eps, "\n")

## eps: 1

cat("minPts:", dbscan_minPts, "\n\n")

## minPts: 6

cat("Distribusi cluster DBSCAN:\n")

## Distribusi cluster DBSCAN:

# Number of points per cluster label
print(table(dbscan_model$cluster))

## 
##   0   1   2   3 
##  23 152 495 110

cat("\nCluster 0 = Noise (outlier)\n")

## 
## Cluster 0 = Noise (outlier)

7.4 Validasi Cluster DBSCAN

7.4.1 Analisis Noise

# Number and share of points labelled 0 (noise)
dbscan_noise <- sum(dbscan_model$cluster == 0)
dbscan_noise_pct <- 100 * dbscan_noise / nrow(dbscan_pca_data)

# Count the distinct labels other than 0. The previous "number of labels
# minus one" under-counted by one whenever the model produced no noise points.
dbscan_n_clusters <- length(setdiff(unique(dbscan_model$cluster), 0))

cat(sprintf("Noise: %d data (%.2f%%)\n", dbscan_noise, dbscan_noise_pct))

## Noise: 23 data (2.95%)

cat(sprintf("Jumlah cluster (tanpa noise): %d\n", dbscan_n_clusters))

## Jumlah cluster (tanpa noise): 3

cat(sprintf("Data valid (non-noise): %d (%.2f%%)\n", 
            nrow(dbscan_pca_data) - dbscan_noise,
            100 - dbscan_noise_pct))

## Data valid (non-noise): 757 (97.05%)

7.4.2 Silhouette Score (exclude noise)

# Positions of the points that belong to a real cluster (label 0 is noise)
dbscan_idx <- which(dbscan_model$cluster != 0)

# A silhouette needs at least two clusters; handle the degenerate case first
if (length(unique(dbscan_model$cluster[dbscan_idx])) < 2) {
  cat("Silhouette tidak dapat dihitung (cluster < 2)\n")
} else {

  # Silhouette on the non-noise points only, with distances recomputed on
  # the same subset of the PCA scores
  dbscan_sil <- silhouette(
    x = dbscan_model$cluster[dbscan_idx],
    dist = dist(dbscan_pca_data[dbscan_idx, ])
  )

  cat(sprintf("Average Silhouette Score (DBSCAN): %.3f\n",
              mean(dbscan_sil[, 3])))

  fviz_silhouette(dbscan_sil) +
    labs(title = "Silhouette Plot - DBSCAN",
         subtitle = "Exclude Noise Points") +
    theme_minimal()

}

## Average Silhouette Score (DBSCAN): 0.251
##   cluster size ave.sil.width
## 1       1  152          0.45
## 2       2  495          0.18
## 3       3  110          0.30

7.5 Visualisasi Hasil DBSCAN

# Scatter plot of the DBSCAN labels on the first two PCA score columns.
# The label vector is passed unchanged, so label 0 (noise) is drawn as its own
# group.
# NOTE(review): the palette puts "gray40" first, presumably for the noise group;
# that ordering only matches when label 0 is present - verify.
# NOTE(review): `colors_cluster` is not defined in the visible code - presumably
# it is created in a hidden setup chunk; confirm.
fviz_cluster(
  list(
    data = dbscan_pca_data[, 1:2],
    cluster = dbscan_model$cluster
  ),
  geom = "point",
  ellipse = TRUE,
  ellipse.type = "convex",
  palette = c("gray40", colors_cluster),
  ggtheme = theme_minimal(),
  main = ""
) +
  labs(
    title = "DBSCAN Clustering Result (PCA Space)",
    # Subtitle summarises the parameters and the noise count / share
    subtitle = paste(
      "eps =", dbscan_eps,
      "| minPts =", dbscan_minPts,
      "| Noise =", dbscan_noise, sprintf("(%.1f%%)", dbscan_noise_pct)
    ),
    x = "PC1",
    y = "PC2"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5, color = "gray40"),
    legend.position = "bottom"
  )

7.6 Export dan Analisis Hasil

7.6.1 Simpan Hasil Cluster DBSCAN

# Attach the DBSCAN labels to the data table; this relies on `data` and the
# DBSCAN input holding the same rows in the same order
data$DBSCAN_Cluster <- dbscan_model$cluster

# Write the table (now including the DBSCAN label column) to a CSV file in the
# working directory; row names are not written
write.csv(data, "laptop_windows_dbscan.csv", row.names = FALSE)

cat("File DBSCAN tersimpan: laptop_windows_dbscan.csv\n")

## File DBSCAN tersimpan: laptop_windows_dbscan.csv

cat("Kolom DBSCAN_Cluster ditambahkan ke dataset\n")

## Kolom DBSCAN_Cluster ditambahkan ke dataset

7.6.2 Karakteristik Cluster DBSCAN

# Total number of non-noise points. It has to be computed before grouping:
# inside summarise() the expression `sum(DBSCAN_Cluster != 0)` only sees the
# rows of the current group, which made every cluster's share come out as 100.
dbscan_n_valid <- sum(data$DBSCAN_Cluster != 0)

# Profile of every DBSCAN cluster (noise label 0 excluded): size, share of the
# non-noise points, feature means and percentage of touchscreen models
dbscan_summary <- data %>%
  filter(DBSCAN_Cluster != 0) %>%
  group_by(DBSCAN_Cluster) %>%
  summarise(
    n = n(),
    pct = n() / dbscan_n_valid * 100,
    avg_price = mean(Price),
    avg_ram = mean(Ram),
    avg_ssd = mean(SSD),
    avg_hdd = mean(HDD),
    avg_weight = mean(Weight),
    avg_ppi = mean(Ppi),
    pct_touchscreen = mean(TouchScreen) * 100
  )

cat("=== KARAKTERISTIK CLUSTER DBSCAN ===\n\n")

## === KARAKTERISTIK CLUSTER DBSCAN ===

print(round(dbscan_summary, 2))

## # A tibble: 3 × 10
##   DBSCAN_Cluster     n   pct avg_price avg_ram avg_ssd avg_hdd avg_weight
##            <dbl> <dbl> <dbl>     <dbl>   <dbl>   <dbl>   <dbl>      <dbl>
## 1              1   152   100      10.3    4       2.53  711.         2.11
## 2              2   495   100      11.0    7.94  203.    340.         1.95
## 3              3   110   100      10.5    4.22  188       0.29       1.83
## # ℹ 2 more variables: avg_ppi <dbl>, pct_touchscreen <dbl>

7.6.3 Perbandingan dengan K-Means

# Compare the number of clusters found by the two methods
cat("\n=== PERBANDINGAN K-MEANS vs DBSCAN ===\n\n")

## 
## === PERBANDINGAN K-MEANS vs DBSCAN ===

cat("K-Means:\n")

## K-Means:

# Derive the cluster count from the fitted model (one size entry per cluster)
# instead of a literal 3, matching how the DBSCAN count below is computed
cat("  - Jumlah cluster:", length(k3$size), "\n")

##   - Jumlah cluster: 3

cat("  - Total data:", nrow(data), "\n")

##   - Total data: 780

cat("  - Semua data masuk cluster\n\n")

##   - Semua data masuk cluster

cat("DBSCAN:\n")

## DBSCAN:

cat("  - Jumlah cluster:", dbscan_n_clusters, "\n")

##   - Jumlah cluster: 3

cat("  - Total data:", nrow(data), "\n")

##   - Total data: 780

cat("  - Noise/outlier:", dbscan_noise, sprintf("(%.2f%%)\n", dbscan_noise_pct))

##   - Noise/outlier: 23 (2.95%)

# Points that were assigned to a cluster, i.e. everything except noise
cat("  - Data valid:", nrow(data) - dbscan_noise, "\n")

##   - Data valid: 757

7.7 Kesimpulan DBSCAN

# Plain-text recap of the DBSCAN analysis
cat("=== KESIMPULAN DBSCAN ===\n\n")

## === KESIMPULAN DBSCAN ===

cat("Dataset: Laptop Windows (sama dengan K-Means)\n")

## Dataset: Laptop Windows (sama dengan K-Means)

cat(sprintf("Data diproses: %d laptop\n", nrow(data)))

## Data diproses: 780 laptop

cat(sprintf("Fitur: %s\n", toString(dbscan_features)))

## Fitur: Ram, SSD, HDD, Weight, Ppi, Price

cat(sprintf("Parameter: eps = %s, minPts = %d\n", dbscan_eps, dbscan_minPts))

## Parameter: eps = 1, minPts = 6

cat(sprintf("Jumlah cluster: %d\n", dbscan_n_clusters))

## Jumlah cluster: 3

cat(sprintf("Noise: %d data (%.2f%%)\n", dbscan_noise, dbscan_noise_pct))

## Noise: 23 data (2.95%)

# The silhouette object only exists when at least two clusters were found
if (exists("dbscan_sil")) {
  cat(sprintf("Silhouette Score: %.3f\n", mean(dbscan_sil[, "sil_width"])))
}

## Silhouette Score: 0.251

cat("\nKelebihan DBSCAN dibanding K-Means:\n")

## 
## Kelebihan DBSCAN dibanding K-Means:

cat("  - Tidak perlu tentukan jumlah cluster\n")

##   - Tidak perlu tentukan jumlah cluster

cat("  - Dapat deteksi outlier/noise\n")

##   - Dapat deteksi outlier/noise

cat("  - Dapat menemukan cluster bentuk arbitrary\n")

##   - Dapat menemukan cluster bentuk arbitrary

Analisis Clustering Laptop Windows

Perbandingan K-Means dan DBSCAN untuk Segmentasi Laptop

Bfq

2025-12-17

1. Tujuan dan Alasan Klasterisasi

1.1 Import Library

1.2 Load Dataset

1.3 Tentukan Fokus Analisis

2. Persiapan Data

2.1 Filter Dataset

2.2 Buang Kolom yang Tidak Diperlukan

2.3 Periksa Tipe Data

2.4 Pengecekan Nilai Null

3. Analisis Deskriptif

3.1 Ringkasan Dataset

3.2 Visualisasi Awal - Boxplot

3.3 Visualisasi Awal - Histogram

3.4 Data Cleaning - IQR Method

3.5 Boxplot Setelah Cleaning

4. Normalisasi Data

4.1 Standardisasi (Z-Score)

4.2 Cek Distribusi Setelah Normalisasi

5. Pemilihan Fitur

5.1 Analisis Korelasi Antar Fitur

5.2 PCA (Principal Component Analysis)

6. K-Means Clustering

6.1 Visualisasi Jarak Antar Data

6.2 Eksperimen K-Means dengan Berbagai K

6.2.1 K-Means dengan K=2

6.2.2 K-Means dengan K=3

6.2.3 K-Means dengan K=4

6.3 Validasi dan Pemilihan K Optimal

6.3.1 Elbow Method

6.3.2 Silhouette Score

6.3.3 Perbandingan Hasil K=2, K=3, K=4

6.3.4 Silhouette Plot untuk K=3

6.4 Analisis Hasil Cluster Terpilih (K=3)

6.4.1 Visualisasi Cluster

6.4.2 Karakteristik Cluster

6.4.3 Visualisasi Perbandingan Karakteristik

6.4.4 Interpretasi Bisnis

6.4.5 Distribusi Brand per Cluster

6.4.6 Distribusi Type per Cluster

6.5 Export Hasil K-Means

6.6 Kesimpulan K-Means

7. DBSCAN Clustering

7.1 Import Library DBSCAN

7.2 Persiapan Data untuk DBSCAN

7.2.1 Ringkasan Data DBSCAN

7.2.2 Normalisasi Data DBSCAN

7.2.3 PCA untuk Reduksi Dimensi

7.3 Penentuan Parameter DBSCAN

7.3.1 Tentukan Parameter minPts

7.3.2 Menentukan eps (k-NN Distance Plot)

7.3.3 Jalankan DBSCAN

7.4 Validasi Cluster DBSCAN

7.4.1 Analisis Noise

7.4.2 Silhouette Score (exclude noise)

7.5 Visualisasi Hasil DBSCAN

7.6 Export dan Analisis Hasil

7.6.1 Simpan Hasil Cluster DBSCAN

7.6.2 Karakteristik Cluster DBSCAN

7.6.3 Perbandingan dengan K-Means

7.7 Kesimpulan DBSCAN