The Support Vector Machine (SVM) is a machine-learning algorithm for classification and regression that searches for the optimal hyperplane separating the data classes with the maximum margin.
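For a binary problem with labels $y_i \in \{-1, +1\}$, the soft-margin form of this optimization, which is what `e1071::svm()` solves for C-classification, can be written as

$$
\min_{w,\,b,\,\xi} \; \frac{1}{2}\lVert w \rVert^2 + C \sum_{i=1}^{n} \xi_i
\quad \text{subject to} \quad y_i\,(w^\top x_i + b) \ge 1 - \xi_i, \;\; \xi_i \ge 0,
$$

where the slack variables $\xi_i$ allow some points to violate the margin and the penalty $C$ appears below as the `cost` argument. Multiclass problems such as the three penguin species are handled by combining binary classifiers (libsvm uses one-vs-one).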
# Load libraries
library(palmerpenguins) # Penguin dataset
library(e1071)          # SVM
library(caret)          # Machine learning toolkit
library(ggplot2)        # Visualization
library(dplyr)          # Data manipulation
library(gridExtra)      # Multiple plots
library(RColorBrewer)   # Color palettes
library(plotly)         # Interactive plots
library(corrplot)       # Correlation plot
library(knitr)          # Kable tables
library(pROC)           # ROC curves
library(tidyr)          # Data tidying (for the pivot functions)
# Load dataset
data(penguins)
df <- penguins
# Basic dataset information
cat("Dataset dimensions:", dim(df), "\n")
## Dataset dimensions: 344 8
cat("Number of missing values:", sum(is.na(df)), "\n")
## Number of missing values: 19
str(df)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
## $ species : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ island : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ bill_length_mm : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ bill_depth_mm : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
## $ sex : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
## $ year : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
# Summary statistics
summary(df)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :152   Biscoe   :168   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :124   1st Qu.:39.23   1st Qu.:15.60  
##  Gentoo   :124   Torgersen: 52   Median :44.45   Median :17.30  
##                                  Mean   :43.92   Mean   :17.15  
##                                  3rd Qu.:48.50   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##                                  NA's   :2       NA's   :2      
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172.0     Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190.0     1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197.0     Median :4050   NA's  : 11   Median :2008  
##  Mean   :200.9     Mean   :4202                Mean   :2008  
##  3rd Qu.:213.0     3rd Qu.:4750                3rd Qu.:2009  
##  Max.   :231.0     Max.   :6300                Max.   :2009  
##  NA's   :2         NA's   :2                   
# Species counts
table(df$species)
## 
##    Adelie Chinstrap    Gentoo 
##       152        68       124
# Visualize the species distribution
p1 <- ggplot(df, aes(x = species, fill = species)) +
  geom_bar() +
  labs(title = "Penguin Species Distribution",
       x = "Species", y = "Count") +
  theme_minimal() +
  scale_fill_brewer(type = "qual", palette = "Set3")
# Correlation between the numeric variables
numeric_vars <- df %>% select_if(is.numeric)
correlation_matrix <- cor(numeric_vars, use = "complete.obs")
print(p1)
corrplot(correlation_matrix, method = "circle", type = "upper",
         title = "Correlation Matrix of Numeric Variables", mar = c(0, 0, 1, 0))
# Drop rows with missing values
df_clean <- na.omit(df)
cat("Data after removing missing values:", nrow(df_clean), "rows\n")
## Data after removing missing values: 333 rows
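Dropping incomplete rows costs only 11 of the 344 observations here. As a hypothetical alternative (not used in the rest of this analysis), the numeric gaps could be median-imputed instead:

# Sketch of median imputation for the numeric columns; the rows with a
# missing sex value would still need separate handling
df_imputed <- df %>%
  mutate(across(where(is.numeric),
                ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)))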
# Select the variables for the analysis (physical measurements)
df_model <- df_clean %>%
  select(species, bill_length_mm, bill_depth_mm,
         flipper_length_mm, body_mass_g)
# Standardize the numeric features
df_scaled <- df_model
df_scaled[, 2:5] <- scale(df_scaled[, 2:5])
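As a quick sanity check (a sketch), the standardized columns should now have mean approximately 0 and standard deviation 1. Note that the scaling statistics are computed on the full dataset before the split below; a stricter protocol would derive them from the training partition only.

# Verify the standardization
round(colMeans(df_scaled[, 2:5]), 3)
sapply(df_scaled[, 2:5], sd)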
# Split into training and testing data (80:20)
set.seed(123)
train_index <- createDataPartition(df_scaled$species, p = 0.8, list = FALSE)
train_data <- df_scaled[train_index, ]
test_data <- df_scaled[-train_index, ]
cat("Training data:", nrow(train_data), "rows\n")
## Training data: 268 rows
cat("Testing data:", nrow(test_data), "rows\n")
## Testing data: 65 rows
# Class distribution in the training and testing data
train_dist <- table(train_data$species)
test_dist <- table(test_data$species)
kable(rbind(train_dist, test_dist),
      caption = "Class Distribution in the Training and Testing Data")
|            | Adelie | Chinstrap | Gentoo |
|------------|-------:|----------:|-------:|
| train_dist |    117 |        55 |     96 |
| test_dist  |     29 |        13 |     23 |
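A quick check (sketch) that the partition is properly stratified: createDataPartition() samples within each class, so the class proportions should be nearly identical across the two partitions.

# Class proportions per partition; the rows should match closely
round(rbind(Train = prop.table(train_dist),
            Test  = prop.table(test_dist)), 3)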
# Train a linear SVM (the data is already standardized, hence scale = FALSE)
svm_linear <- svm(species ~ ., data = train_data,
                  kernel = "linear", cost = 1, scale = FALSE)
# Model summary
summary(svm_linear)
##
## Call:
## svm(formula = species ~ ., data = train_data, kernel = "linear",
## cost = 1, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 20
##
## ( 9 3 8 )
##
##
## Number of Classes: 3
##
## Levels:
## Adelie Chinstrap Gentoo
# Predict on the testing data
pred_linear <- predict(svm_linear, test_data)
# Confusion Matrix
cm_linear <- confusionMatrix(pred_linear, test_data$species)
print(cm_linear)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 29 2 0
## Chinstrap 0 11 0
## Gentoo 0 0 23
##
## Overall Statistics
##
## Accuracy : 0.9692
## 95% CI : (0.8932, 0.9963)
## No Information Rate : 0.4462
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.951
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 1.0000 0.8462 1.0000
## Specificity 0.9444 1.0000 1.0000
## Pos Pred Value 0.9355 1.0000 1.0000
## Neg Pred Value 1.0000 0.9630 1.0000
## Prevalence 0.4462 0.2000 0.3538
## Detection Rate 0.4462 0.1692 0.3538
## Detection Prevalence 0.4769 0.1692 0.3538
## Balanced Accuracy 0.9722 0.9231 1.0000
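As a cross-check, the overall accuracy is simply the diagonal of the confusion-matrix table over the total count, here (29 + 11 + 23) / 65:

# Accuracy by hand: correct predictions (diagonal) over all predictions
sum(diag(cm_linear$table)) / sum(cm_linear$table)
## [1] 0.9692308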
# Accuracy, precision, and recall for each class
accuracy_linear <- cm_linear$overall['Accuracy']
# Handle cases where byClass is NULL or has a different structure
if (!is.null(cm_linear$byClass) && is.matrix(cm_linear$byClass)) {
  precision_linear <- cm_linear$byClass[, 'Pos Pred Value']
  recall_linear <- cm_linear$byClass[, 'Sensitivity']
  f1_linear <- cm_linear$byClass[, 'F1']
} else {
  # Binary classification returns a named vector instead of a matrix
  precision_linear <- c(cm_linear$byClass['Pos Pred Value'])
  recall_linear <- c(cm_linear$byClass['Sensitivity'])
  f1_linear <- c(cm_linear$byClass['F1'])
}
# Replace NA metric values with 0
precision_linear[is.na(precision_linear)] <- 0
recall_linear[is.na(recall_linear)] <- 0
f1_linear[is.na(f1_linear)] <- 0
# Evaluation results table
eval_linear <- data.frame(
  Metrics = c("Accuracy",
              paste0("Precision_", names(precision_linear)),
              paste0("Recall_", names(recall_linear)),
              paste0("F1_", names(f1_linear))),
  Values = c(accuracy_linear, precision_linear, recall_linear, f1_linear)
)
kable(eval_linear, digits = 4, caption = "Evaluation of the Linear SVM Model")
| Metrics                    | Values |
|----------------------------|-------:|
| Accuracy                   | 0.9692 |
| Precision_Class: Adelie    | 0.9355 |
| Precision_Class: Chinstrap | 1.0000 |
| Precision_Class: Gentoo    | 1.0000 |
| Recall_Class: Adelie       | 1.0000 |
| Recall_Class: Chinstrap    | 0.8462 |
| Recall_Class: Gentoo       | 1.0000 |
| F1_Class: Adelie           | 0.9667 |
| F1_Class: Chinstrap        | 0.9167 |
| F1_Class: Gentoo           | 1.0000 |
# Train an SVM with the RBF (radial) kernel
svm_rbf <- svm(species ~ ., data = train_data,
               kernel = "radial", cost = 1, gamma = 0.25, scale = FALSE)
# Model summary
summary(svm_rbf)
##
## Call:
## svm(formula = species ~ ., data = train_data, kernel = "radial",
## cost = 1, gamma = 0.25, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 46
##
## ( 19 8 19 )
##
##
## Number of Classes: 3
##
## Levels:
## Adelie Chinstrap Gentoo
# Predict on the testing data
pred_rbf <- predict(svm_rbf, test_data)
# Confusion Matrix
cm_rbf <- confusionMatrix(pred_rbf, test_data$species)
print(cm_rbf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 28 3 0
## Chinstrap 1 10 0
## Gentoo 0 0 23
##
## Overall Statistics
##
## Accuracy : 0.9385
## 95% CI : (0.8499, 0.983)
## No Information Rate : 0.4462
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.902
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.9655 0.7692 1.0000
## Specificity 0.9167 0.9808 1.0000
## Pos Pred Value 0.9032 0.9091 1.0000
## Neg Pred Value 0.9706 0.9444 1.0000
## Prevalence 0.4462 0.2000 0.3538
## Detection Rate 0.4308 0.1538 0.3538
## Detection Prevalence 0.4769 0.1692 0.3538
## Balanced Accuracy 0.9411 0.8750 1.0000
# Accuracy, precision, and recall for each class
accuracy_rbf <- cm_rbf$overall['Accuracy']
# Handle cases where byClass is NULL or has a different structure
if (!is.null(cm_rbf$byClass) && is.matrix(cm_rbf$byClass)) {
  precision_rbf <- cm_rbf$byClass[, 'Pos Pred Value']
  recall_rbf <- cm_rbf$byClass[, 'Sensitivity']
  f1_rbf <- cm_rbf$byClass[, 'F1']
} else {
  # Binary classification returns a named vector instead of a matrix
  precision_rbf <- c(cm_rbf$byClass['Pos Pred Value'])
  recall_rbf <- c(cm_rbf$byClass['Sensitivity'])
  f1_rbf <- c(cm_rbf$byClass['F1'])
}
# Replace NA metric values with 0
precision_rbf[is.na(precision_rbf)] <- 0
recall_rbf[is.na(recall_rbf)] <- 0
f1_rbf[is.na(f1_rbf)] <- 0
# Evaluation results table
eval_rbf <- data.frame(
  Metrics = c("Accuracy",
              paste0("Precision_", names(precision_rbf)),
              paste0("Recall_", names(recall_rbf)),
              paste0("F1_", names(f1_rbf))),
  Values = c(accuracy_rbf, precision_rbf, recall_rbf, f1_rbf)
)
kable(eval_rbf, digits = 4, caption = "Evaluation of the RBF SVM Model")
| Metrics                    | Values |
|----------------------------|-------:|
| Accuracy                   | 0.9385 |
| Precision_Class: Adelie    | 0.9032 |
| Precision_Class: Chinstrap | 0.9091 |
| Precision_Class: Gentoo    | 1.0000 |
| Recall_Class: Adelie       | 0.9655 |
| Recall_Class: Chinstrap    | 0.7692 |
| Recall_Class: Gentoo       | 1.0000 |
| F1_Class: Adelie           | 0.9333 |
| F1_Class: Chinstrap        | 0.8333 |
| F1_Class: Gentoo           | 1.0000 |
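Since the metric extraction above is repeated verbatim for both models, a small helper could replace the duplicated blocks. A sketch, assuming the multiclass case in which byClass is a matrix:

# Reusable evaluator for a caret confusionMatrix object
evaluate_cm <- function(cm) {
  bc <- cm$byClass
  out <- data.frame(
    Metrics = c("Accuracy",
                paste0("Precision_", rownames(bc)),
                paste0("Recall_", rownames(bc)),
                paste0("F1_", rownames(bc))),
    Values = c(cm$overall["Accuracy"], bc[, "Pos Pred Value"],
               bc[, "Sensitivity"], bc[, "F1"])
  )
  out$Values[is.na(out$Values)] <- 0  # guard against undefined metrics
  out
}
# Equivalent to the blocks above:
# eval_linear <- evaluate_cm(cm_linear); eval_rbf <- evaluate_cm(cm_rbf)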
# Make sure both data frames have the same structure
min_length <- min(nrow(eval_linear), nrow(eval_rbf))
eval_linear_subset <- eval_linear[1:min_length, ]
eval_rbf_subset <- eval_rbf[1:min_length, ]
# Combine the evaluation results
comparison <- data.frame(
  Metrics = eval_linear_subset$Metrics,
  SVM_Linear = eval_linear_subset$Values,
  SVM_RBF = eval_rbf_subset$Values
)
kable(comparison, digits = 4,
      caption = "Performance Comparison: Linear vs RBF SVM")
| Metrics                    | SVM_Linear | SVM_RBF |
|----------------------------|-----------:|--------:|
| Accuracy                   |     0.9692 |  0.9385 |
| Precision_Class: Adelie    |     0.9355 |  0.9032 |
| Precision_Class: Chinstrap |     1.0000 |  0.9091 |
| Precision_Class: Gentoo    |     1.0000 |  1.0000 |
| Recall_Class: Adelie       |     1.0000 |  0.9655 |
| Recall_Class: Chinstrap    |     0.8462 |  0.7692 |
| Recall_Class: Gentoo       |     1.0000 |  1.0000 |
| F1_Class: Adelie           |     0.9667 |  0.9333 |
| F1_Class: Chinstrap        |     0.9167 |  0.8333 |
| F1_Class: Gentoo           |     1.0000 |  1.0000 |
# Visualize the accuracy comparison
accuracy_comparison <- data.frame(
  Model = c("SVM Linear", "SVM RBF"),
  Accuracy = c(accuracy_linear, accuracy_rbf)
)
ggplot(accuracy_comparison, aes(x = Model, y = Accuracy, fill = Model)) +
  geom_col() +
  geom_text(aes(label = round(Accuracy, 4)), vjust = -0.5) +
  labs(title = "Accuracy Comparison of the SVM Models",
       y = "Accuracy") +
  theme_minimal() +
  scale_fill_brewer(type = "qual", palette = "Pastel1") +
  ylim(0, 1)
# Grid search for the RBF SVM
tune_result <- tune(svm, species ~ ., data = train_data,
                    kernel = "radial",
                    ranges = list(cost = c(0.1, 1, 10, 100),
                                  gamma = c(0.01, 0.1, 1, 10)))
# Tuning results
summary(tune_result)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 100 0.1
##
## - best performance: 0.003703704
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.01 0.205698006 0.06341903
## 2 1.0 0.01 0.018660969 0.03607768
## 3 10.0 0.01 0.007549858 0.01592004
## 4 100.0 0.01 0.007549858 0.01592004
## 5 0.1 0.10 0.018660969 0.02630468
## 6 1.0 0.10 0.011253561 0.01812410
## 7 10.0 0.10 0.007549858 0.01592004
## 8 100.0 0.10 0.003703704 0.01171214
## 9 0.1 1.00 0.026068376 0.03518932
## 10 1.0 1.00 0.011253561 0.01812410
## 11 10.0 1.00 0.007407407 0.01561619
## 12 100.0 1.00 0.007407407 0.01561619
## 13 0.1 10.00 0.563960114 0.09653139
## 14 1.0 10.00 0.130484330 0.10051728
## 15 10.0 10.00 0.119373219 0.09525313
## 16 100.0 10.00 0.119373219 0.09525313
# Best parameters
best_params <- tune_result$best.parameters
cat("Optimal parameters - Cost:", best_params$cost, "Gamma:", best_params$gamma, "\n")
## Optimal parameters - Cost: 100 Gamma: 0.1
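Note that tune() draws random 10-fold cross-validation splits, so the error table above can vary between runs. A reproducible variant (sketch; the seed value is arbitrary):

# Fix the RNG state so the CV folds, and hence the grid results, repeat
set.seed(2024)
tune_repro <- tune(svm, species ~ ., data = train_data,
                   kernel = "radial",
                   ranges = list(cost = c(0.1, 1, 10, 100),
                                 gamma = c(0.01, 0.1, 1, 10)))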
# Model with the optimal parameters
svm_tuned <- tune_result$best.model
# Predict with the tuned model
pred_tuned <- predict(svm_tuned, test_data)
cm_tuned <- confusionMatrix(pred_tuned, test_data$species)
cat("Tuned model accuracy:", cm_tuned$overall['Accuracy'], "\n")
## Tuned model accuracy: 0.9692308
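With all three confusion matrices available, the test accuracies can be lined up directly (a sketch using the objects defined above):

# Side-by-side test accuracy of the three fitted models
data.frame(
  Model = c("SVM Linear", "SVM RBF (default)", "SVM RBF (tuned)"),
  Accuracy = c(cm_linear$overall["Accuracy"],
               cm_rbf$overall["Accuracy"],
               cm_tuned$overall["Accuracy"])
)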
# Heatmap of the tuning results
tuning_results <- tune_result$performances
ggplot(tuning_results, aes(x = factor(cost), y = factor(gamma), fill = error)) +
  geom_tile() +
  scale_fill_gradient(low = "pink", high = "red") +
  labs(title = "Error Rate Heatmap - Parameter Tuning",
       x = "Cost", y = "Gamma", fill = "Error Rate") +
  theme_minimal()
# Pick two features for a 2D visualization (bill_length_mm vs bill_depth_mm)
train_2d <- train_data %>% select(species, bill_length_mm, bill_depth_mm)
test_2d <- test_data %>% select(species, bill_length_mm, bill_depth_mm)
# SVM models on the 2D data
svm_2d_linear <- svm(species ~ ., data = train_2d, kernel = "linear", cost = 1)
svm_2d_rbf <- svm(species ~ ., data = train_2d, kernel = "radial",
                  cost = best_params$cost, gamma = best_params$gamma)
# Function to plot a decision boundary over a prediction grid
plot_decision_boundary <- function(model, data, title) {
  # Create a grid spanning the feature space
  x1 <- seq(min(data$bill_length_mm) - 0.5, max(data$bill_length_mm) + 0.5,
            length.out = 100)
  x2 <- seq(min(data$bill_depth_mm) - 0.5, max(data$bill_depth_mm) + 0.5,
            length.out = 100)
  grid <- expand.grid(bill_length_mm = x1, bill_depth_mm = x2)
  # Predict the class at every grid point
  grid$species <- predict(model, grid)
  # Plot the prediction grid (the boundary) under the observed points
  ggplot() +
    geom_point(data = grid, aes(x = bill_length_mm, y = bill_depth_mm,
                                color = species), alpha = 0.3, size = 0.5) +
    geom_point(data = data, aes(x = bill_length_mm, y = bill_depth_mm,
                                color = species), size = 2) +
    labs(title = title,
         x = "Bill Length (mm) - Standardized",
         y = "Bill Depth (mm) - Standardized") +
    theme_minimal() +
    scale_color_brewer(type = "qual", palette = "Set1")
}
# Plot decision boundaries
p_linear_2d <- plot_decision_boundary(svm_2d_linear, train_2d,
                                      "SVM Linear - Decision Boundary")
p_rbf_2d <- plot_decision_boundary(svm_2d_rbf, train_2d,
                                   "SVM RBF - Decision Boundary")
grid.arrange(p_linear_2d, p_rbf_2d, ncol = 2)
# Test several values of C with gamma held fixed
c_values <- c(0.01, 0.1, 1, 10, 100)
c_results <- data.frame(C = c_values, Accuracy = numeric(length(c_values)))
for (i in seq_along(c_values)) {
  svm_c <- svm(species ~ ., data = train_data, kernel = "radial",
               cost = c_values[i], gamma = 0.25, scale = FALSE)
  pred_c <- predict(svm_c, test_data)
  cm_c <- confusionMatrix(pred_c, test_data$species)
  c_results$Accuracy[i] <- cm_c$overall['Accuracy']
}
# Plot the effect of parameter C
ggplot(c_results, aes(x = log10(C), y = Accuracy)) +
  geom_line(color = "lightblue", linewidth = 1) +
  geom_point(color = "lightpink", size = 3) +
  labs(title = "Effect of Parameter C on Accuracy",
       x = "log10(C)", y = "Accuracy") +
  theme_minimal()
kable(c_results, digits = 4)

| C     | Accuracy |
|-------|---------:|
| 1e-02 |   0.7846 |
| 1e-01 |   0.9538 |
| 1e+00 |   0.9385 |
| 1e+01 |   0.9538 |
| 1e+02 |   0.9538 |
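C also governs model complexity: a larger C penalizes margin violations more heavily, which typically shrinks the support-vector set. A sketch that reuses the loop's settings to track this:

# Number of support vectors at each C (gamma fixed at 0.25, as above)
sv_by_C <- sapply(c_values, function(C) {
  svm(species ~ ., data = train_data, kernel = "radial",
      cost = C, gamma = 0.25, scale = FALSE)$tot.nSV
})
data.frame(C = c_values, SupportVectors = sv_by_C)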
# Test several values of gamma with C held fixed
gamma_values <- c(0.001, 0.01, 0.1, 1, 10)
gamma_results <- data.frame(Gamma = gamma_values,
                            Accuracy = numeric(length(gamma_values)))
for (i in seq_along(gamma_values)) {
  svm_gamma <- svm(species ~ ., data = train_data, kernel = "radial",
                   cost = 1, gamma = gamma_values[i], scale = FALSE)
  pred_gamma <- predict(svm_gamma, test_data)
  cm_gamma <- confusionMatrix(pred_gamma, test_data$species)
  gamma_results$Accuracy[i] <- cm_gamma$overall['Accuracy']
}
# Plot the effect of parameter gamma
ggplot(gamma_results, aes(x = log10(Gamma), y = Accuracy)) +
  geom_line(color = "lightgreen", linewidth = 1) +
  geom_point(color = "palevioletred1", size = 3) +
  labs(title = "Effect of Parameter Gamma on Accuracy",
       x = "log10(Gamma)", y = "Accuracy") +
  theme_minimal()
kable(gamma_results, digits = 4)

| Gamma | Accuracy |
|-------|---------:|
| 1e-03 |   0.8000 |
| 1e-02 |   0.9538 |
| 1e-01 |   0.9538 |
| 1e+00 |   0.9385 |
| 1e+01 |   0.8308 |
# Total number of support vectors in each model
cat("Number of support vectors, SVM Linear:", svm_linear$tot.nSV, "\n")
cat("Number of support vectors, SVM RBF:", svm_rbf$tot.nSV, "\n")
cat("Number of support vectors, SVM Tuned:", svm_tuned$tot.nSV, "\n")
## Number of support vectors, SVM Linear: 20
## Number of support vectors, SVM RBF: 46
## Number of support vectors, SVM Tuned: 16
# Support vectors per class: $nSV is an unnamed integer vector ordered by
# factor level, so bind the counts by position rather than by name
species_names <- levels(train_data$species)
sv_comparison <- data.frame(
  Species = species_names,
  SVM_Linear = svm_linear$nSV,
  SVM_RBF = svm_rbf$nSV,
  SVM_Tuned = svm_tuned$nSV
)
kable(sv_comparison, caption = "Number of Support Vectors per Class")
| Species   | SVM_Linear | SVM_RBF | SVM_Tuned |
|-----------|-----------:|--------:|----------:|
| Adelie    |          9 |      19 |         … |
| Chinstrap |          3 |       8 |         … |
| Gentoo    |          8 |      19 |         … |

(The Linear and RBF counts follow from the model summaries above; the tuned model's per-class counts sum to its 16 support vectors.)
# Visualize the support vector counts
sv_long <- sv_comparison %>%
  tidyr::pivot_longer(cols = -Species, names_to = "Model", values_to = "Count")
ggplot(sv_long, aes(x = Species, y = Count, fill = Model)) +
  geom_col(position = "dodge") +
  labs(title = "Comparison of Support Vector Counts",
       x = "Species", y = "Number of Support Vectors") +
  theme_minimal() +
  scale_fill_brewer(type = "qual", palette = "Set2")
# Weight vectors for the linear SVM: with three classes, $coefs holds
# k - 1 = 2 columns of dual coefficients, so t(coefs) %*% SV yields one
# weight vector per internal decision function (a 2 x 4 matrix whose
# columns carry the feature names)
weights <- t(svm_linear$coefs) %*% svm_linear$SV
feature_names <- colnames(train_data)[-1]
# Aggregate the absolute weights across the decision functions per feature
importance_df <- data.frame(
  Feature = feature_names,
  Importance = colSums(abs(weights)),
  row.names = NULL
)
importance_df <- importance_df[order(importance_df$Importance, decreasing = TRUE), ]
ggplot(importance_df, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "lightcyan") +
  coord_flip() +
  labs(title = "Feature Importance - Linear SVM",
       x = "Feature", y = "Sum of Absolute Weights") +
  theme_minimal()
kable(importance_df, digits = 4)

|   | Feature           | Importance |
|---|-------------------|-----------:|
| 3 | flipper_length_mm |     9.0732 |
| 4 | body_mass_g       |     7.9749 |
| 2 | bill_depth_mm     |     3.1571 |
| 1 | bill_length_mm    |     2.6654 |
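Because the weight-based ranking is specific to the linear kernel, a model-agnostic cross-check is permutation importance: shuffle one feature at a time in the test set and record the drop in accuracy. A sketch (illustrative, not part of the original analysis):

# Permutation importance: the larger the accuracy drop when a feature is
# shuffled, the more the model relies on that feature
set.seed(42)  # arbitrary seed for the shuffles
base_acc <- mean(predict(svm_linear, test_data) == test_data$species)
perm_drop <- sapply(feature_names, function(f) {
  shuffled <- test_data
  shuffled[[f]] <- sample(shuffled[[f]])
  base_acc - mean(predict(svm_linear, shuffled) == test_data$species)
})
round(sort(perm_drop, decreasing = TRUE), 4)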
Based on this analysis of the Palmer Penguins dataset, the following conclusions can be drawn:

- The linear SVM reached 96.9% test accuracy with only 20 support vectors, suggesting the three species are nearly linearly separable in the standardized measurement space.
- The RBF SVM with hand-picked parameters (cost = 1, gamma = 0.25) did slightly worse (93.8%); after grid-search tuning (cost = 100, gamma = 0.1), it matched the linear model at 96.9%.
- Chinstrap was the hardest species to identify (recall 0.846 for the linear model, 0.769 for the untuned RBF model), while Gentoo was classified perfectly by every model.
- The parameter sweeps confirm the sensitivity to C and gamma: a very small C underfits (78.5% accuracy), and a very large gamma overfits (83.1%).
Note: this analysis indicates that SVM is a strong classification algorithm, provided the kernel is chosen appropriately and its parameters are well tuned.