Support Vector Machine (SVM) adalah algoritma pembelajaran mesin yang powerful untuk klasifikasi dan regresi. SVM bekerja dengan menemukan hyperplane optimal yang memisahkan kelas-kelas data dengan margin maksimal.
# Install packages jika belum ada
# install.packages(c("e1071", "caret", "ggplot2", "dplyr", "gridExtra", "RColorBrewer"))
# Load libraries
library(e1071) # untuk SVM
library(caret) # untuk machine learning
library(ggplot2) # untuk visualisasi
library(dplyr) # untuk manipulasi data
library(gridExtra) # untuk multiple plots
library(RColorBrewer) # untuk color palette
library(knitr) # untuk kable
library(plotly) # untuk interactive plots

Dalam analisis ini, kita akan menggunakan dataset Iris yang merupakan dataset klasik dalam machine learning.
# Bring the built-in Iris data set into the workspace and keep a working copy
data("iris")
df <- iris
# Tampilkan informasi dasar dataset
cat("Dimensi dataset:", dim(df), "\n")## Dimensi dataset: 150 5
## Struktur dataset:
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
## Missing values per kolom:
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
##
## setosa versicolor virginica
## 50 50 50
# 1. Distribution of the numeric features
# pivot_longer() replaces the superseded gather(): each measurement column
# becomes a (Variable, Value) pair so the histograms can be faceted by feature.
p1 <- df %>%
  tidyr::pivot_longer(
    cols = -Species,
    names_to = "Variable",
    values_to = "Value"
  ) %>%
  ggplot(aes(x = Value, fill = Species)) +
  geom_histogram(alpha = 0.7, bins = 20) +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribusi Fitur Berdasarkan Spesies")
print(p1)

# 2. Boxplot for each feature
# Long format via pivot_longer() (gather() is superseded): one row per
# (Species, Variable, Value) so each feature gets its own facet.
p2 <- df %>%
  tidyr::pivot_longer(
    cols = -Species,
    names_to = "Variable",
    values_to = "Value"
  ) %>%
  ggplot(aes(x = Species, y = Value, fill = Species)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal() +
  labs(title = "Boxplot Fitur Berdasarkan Spesies") +
  # Tilt the species labels so they do not overlap inside narrow facets
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p2)

# 3. Correlation plot
library(corrplot)
# Pairwise Pearson correlations of the four numeric columns
cor_matrix <- cor(df[, 1:4])
# `tl.col` sets the text-label colour; the previous spelling `tl.coi` is not a
# corrplot argument, so the intended colour was never applied.
# The non-zero top margin leaves room for the title.
corrplot(cor_matrix,
  method = "color", type = "upper",
  order = "hclust", tl.col = "black", tl.srt = 45,
  title = "Korelasi Antar Fitur",
  mar = c(0, 0, 2, 0)
)

# 4. Scatter plots for 2D visualisation
# Sepal dimensions, coloured by species
p3 <- ggplot(
  data = df,
  mapping = aes(x = Sepal.Length, y = Sepal.Width, color = Species)
) +
  geom_point(size = 3, alpha = 0.8) +
  labs(
    title = "Scatter Plot: Sepal Length vs Sepal Width",
    x = "Sepal Length (cm)",
    y = "Sepal Width (cm)"
  ) +
  theme_minimal()

# Petal dimensions, coloured by species
p4 <- ggplot(
  data = df,
  mapping = aes(x = Petal.Length, y = Petal.Width, color = Species)
) +
  geom_point(size = 3, alpha = 0.8) +
  labs(
    title = "Scatter Plot: Petal Length vs Petal Width",
    x = "Petal Length (cm)",
    y = "Petal Width (cm)"
  ) +
  theme_minimal()

# Show both scatter plots side by side
grid.arrange(p3, p4, ncol = 2)

# Set seed for reproducibility
set.seed(123)
# Partition on Species: 80% of the rows for training, the remainder for testing
train_index <- createDataPartition(y = df$Species, p = 0.8, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]
cat("Ukuran data training:", nrow(train_data), "\n")## Ukuran data training: 120
## Ukuran data testing: 30
##
## Distribusi kelas - Training:
##
## setosa versicolor virginica
## 40 40 40
##
## Distribusi kelas - Testing:
##
## setosa versicolor virginica
## 10 10 10
# Feature scaling
# Column means and standard deviations are estimated on the training rows
# only; vapply() guarantees a plain numeric vector of length four.
means <- vapply(train_data[, 1:4], mean, numeric(1))
sds <- vapply(train_data[, 1:4], sd, numeric(1))
# Apply the SAME centre/scale vectors to both partitions so the test set is
# transformed with training statistics and never with its own.
train_scaled <- train_data
train_scaled[, 1:4] <- scale(train_data[, 1:4], center = means, scale = sds)
test_scaled <- test_data
test_scaled[, 1:4] <- scale(test_data[, 1:4], center = means, scale = sds)
# Tampilkan summary data setelah scaling
cat("Summary data training setelah scaling:\n")## Summary data training setelah scaling:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :-1.79945 Min. :-2.5697 Min. :-1.4940 Min. :-1.4246
## 1st Qu.:-0.87220 1st Qu.:-0.6134 1st Qu.:-1.2285 1st Qu.:-1.1685
## Median :-0.06085 Median :-0.1243 Median : 0.2943 Median : 0.1120
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.66356 3rd Qu.: 0.6093 3rd Qu.: 0.7414 3rd Qu.: 0.7843
## Max. : 2.37318 Max. : 2.5656 Max. : 1.7473 Max. : 1.6487
# Fit a linear-kernel SVM (cost = 1) on the standardised training set,
# using every other column as a predictor of Species.
svm_linear <- svm(Species ~ .,
data = train_scaled,
kernel = "linear",
cost = 1,
scale = FALSE) # internal scaling off: features were already scaled manually
# Summary model
summary(svm_linear)##
## Call:
## svm(formula = Species ~ ., data = train_scaled, kernel = "linear",
## cost = 1, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 25
##
## ( 2 13 10 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
# Score the held-out test set with the linear model
pred_linear <- predict(svm_linear, newdata = test_scaled)
# Cross-tabulate the predictions against the true species
cm_linear <- confusionMatrix(data = pred_linear, reference = test_data$Species)
print(cm_linear)## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 1
## virginica 0 0 9
##
## Overall Statistics
##
## Accuracy : 0.9667
## 95% CI : (0.8278, 0.9992)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 2.963e-13
##
## Kappa : 0.95
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9000
## Specificity 1.0000 0.9500 1.0000
## Pos Pred Value 1.0000 0.9091 1.0000
## Neg Pred Value 1.0000 1.0000 0.9524
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.3000
## Detection Prevalence 0.3333 0.3667 0.3000
## Balanced Accuracy 1.0000 0.9750 0.9500
# Pull overall accuracy and the per-class metrics out of the confusion matrix
linear_accuracy <- cm_linear$overall["Accuracy"]
linear_precision <- cm_linear$byClass[, "Precision"]
linear_recall <- cm_linear$byClass[, "Recall"]
linear_f1 <- cm_linear$byClass[, "F1"]
cat("=== SVM Linear Performance ===\n")## === SVM Linear Performance ===
## Accuracy: 0.9667
## Precision per class:
## Class: setosa Class: versicolor Class: virginica
## 1.0000 0.9091 1.0000
## Recall per class:
## Class: setosa Class: versicolor Class: virginica
## 1.0 1.0 0.9
## F1-Score per class:
## Class: setosa Class: versicolor Class: virginica
## 1.0000 0.9524 0.9474
# Fit a radial (RBF) kernel SVM with cost = 1 and gamma = 0.25 on the
# standardised training set, using every other column as a predictor.
svm_rbf <- svm(Species ~ .,
data = train_scaled,
kernel = "radial",
cost = 1,
gamma = 0.25,
scale = FALSE) # internal scaling off: train_scaled was standardised beforehand
# Summary model
summary(svm_rbf)##
## Call:
## svm(formula = Species ~ ., data = train_scaled, kernel = "radial",
## cost = 1, gamma = 0.25, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 45
##
## ( 8 19 18 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
# Score the held-out test set with the RBF model
pred_rbf <- predict(svm_rbf, newdata = test_scaled)
# Cross-tabulate the predictions against the true species
cm_rbf <- confusionMatrix(data = pred_rbf, reference = test_data$Species)
print(cm_rbf)## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 10 2
## virginica 0 0 8
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.8000
## Specificity 1.0000 0.9000 1.0000
## Pos Pred Value 1.0000 0.8333 1.0000
## Neg Pred Value 1.0000 1.0000 0.9091
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3333 0.2667
## Detection Prevalence 0.3333 0.4000 0.2667
## Balanced Accuracy 1.0000 0.9500 0.9000
# Pull overall accuracy and the per-class metrics out of the confusion matrix
rbf_accuracy <- cm_rbf$overall["Accuracy"]
rbf_precision <- cm_rbf$byClass[, "Precision"]
rbf_recall <- cm_rbf$byClass[, "Recall"]
rbf_f1 <- cm_rbf$byClass[, "F1"]
cat("=== SVM RBF Performance ===\n")## === SVM RBF Performance ===
## Accuracy: 0.9333
## Precision per class:
## Class: setosa Class: versicolor Class: virginica
## 1.0000 0.8333 1.0000
## Recall per class:
## Class: setosa Class: versicolor Class: virginica
## 1.0 1.0 0.8
## F1-Score per class:
## Class: setosa Class: versicolor Class: virginica
## 1.0000 0.9091 0.8889
# Hyperparameter search space, declared once and reused below so the grid and
# the tuning call cannot drift apart.
tune_ranges <- list(
  cost = c(0.1, 1, 10, 100),
  gamma = c(0.01, 0.1, 0.25, 0.5, 1)
)
# Every cost/gamma combination as a data frame (handy for inspection)
tune_grid <- expand.grid(tune_ranges)
# Grid search over `tune_ranges` with 5-fold cross-validation on the training
# set. `scale = FALSE` is forwarded to svm() so each candidate is evaluated
# with the same preprocessing as the final model, which is also fitted on the
# manually standardised data with scale = FALSE.
tune_result <- tune(svm, Species ~ .,
  data = train_scaled,
  kernel = "radial",
  ranges = tune_ranges,
  scale = FALSE,
  tunecontrol = tune.control(cross = 5)
)
# Best parameters
print(tune_result)##
## Parameter tuning of 'svm':
##
## - sampling method: 5-fold cross validation
##
## - best parameters:
## cost gamma
## 100 0.01
##
## - best performance: 0.01666667
##
## Best parameters:
## Cost: 100
## Gamma: 0.01
# Refit on the full training set with the best cross-validated parameters.
# `best_params` is taken directly from the tuning result so this chunk does
# not rely on an assignment made elsewhere in the script.
best_params <- tune_result$best.parameters
svm_best <- svm(Species ~ .,
  data = train_scaled,
  kernel = "radial",
  cost = best_params$cost,
  gamma = best_params$gamma,
  scale = FALSE # features were already standardised manually
)
# Evaluate the tuned model on the held-out test set
pred_best <- predict(svm_best, test_scaled)
cm_best <- confusionMatrix(pred_best, test_data$Species)
cat("=== Best SVM Model Performance ===\n")## === Best SVM Model Performance ===
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 9 1
## virginica 0 1 9
##
## Overall Statistics
##
## Accuracy : 0.9333
## 95% CI : (0.7793, 0.9918)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : 8.747e-12
##
## Kappa : 0.9
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9000 0.9000
## Specificity 1.0000 0.9500 0.9500
## Pos Pred Value 1.0000 0.9000 0.9000
## Neg Pred Value 1.0000 0.9500 0.9500
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3000 0.3000
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9250 0.9250
Untuk visualisasi decision boundary, kita akan menggunakan 2 fitur terpenting.
# Petal.Length and Petal.Width are used because they separate the classes well
# Two-feature subsets (plus the label) of the scaled partitions
train_2d <- train_scaled[c("Petal.Length", "Petal.Width", "Species")]
test_2d <- test_scaled[c("Petal.Length", "Petal.Width", "Species")]
# SVMs fitted on the two petal features only, for 2D visualisation
svm_2d_linear <- svm(Species ~ .,
  data = train_2d,
  kernel = "linear",
  cost = 1
)
svm_2d_rbf <- svm(Species ~ .,
  data = train_2d,
  kernel = "radial",
  cost = best_params$cost,
  gamma = best_params$gamma
)

# Function to plot the decision boundary
#' Plot a fitted classifier's decision boundary over the two petal features
#'
#' Builds a 100 x 100 grid spanning the range of `Petal.Length` and
#' `Petal.Width` in `data` (padded by 0.5 on every side), predicts each grid
#' node with `model`, and draws contour lines of the predicted class codes
#' underneath the observed points.
#'
#' @param model A fitted model whose `predict()` method accepts a data frame
#'   with columns `Petal.Length` and `Petal.Width`.
#' @param data Data frame with columns `Petal.Length`, `Petal.Width` and
#'   `Species`.
#' @param title Plot title.
#' @return A ggplot object.
plot_decision_boundary <- function(model, data, title) {
  stopifnot(
    all(c("Petal.Length", "Petal.Width", "Species") %in% names(data))
  )

  # Prediction grid covering the data range plus a fixed margin
  pad <- 0.5
  grid <- expand.grid(
    Petal.Length = seq(
      min(data$Petal.Length) - pad,
      max(data$Petal.Length) + pad,
      length.out = 100
    ),
    Petal.Width = seq(
      min(data$Petal.Width) - pad,
      max(data$Petal.Width) + pad,
      length.out = 100
    )
  )

  # Predict on the grid, then store the class codes as numbers for contouring
  predicted <- predict(model, grid)
  grid$pred <- as.numeric(predicted)

  # Factor predictions are coded 1..k. Contouring at the midpoints between
  # consecutive codes draws one line per class transition; the default breaks
  # would stack several near-identical lines on every boundary.
  # NULL keeps ggplot2's default breaks for any other prediction type.
  boundary_breaks <- NULL
  if (is.factor(predicted) && nlevels(predicted) > 1) {
    boundary_breaks <- seq_len(nlevels(predicted) - 1) + 0.5
  }

  ggplot() +
    geom_contour(
      data = grid,
      aes(x = Petal.Length, y = Petal.Width, z = pred),
      breaks = boundary_breaks,
      color = "black", alpha = 0.3
    ) +
    geom_point(
      data = data,
      aes(x = Petal.Length, y = Petal.Width, color = Species),
      size = 3, alpha = 0.8
    ) +
    labs(
      title = title,
      x = "Petal Length (scaled)",
      y = "Petal Width (scaled)"
    ) +
    theme_minimal() +
    theme(legend.position = "bottom")
}

# Plot decision boundaries
# One boundary plot per kernel, both drawn over the 2D training data
p_linear <- plot_decision_boundary(
  model = svm_2d_linear,
  data = train_2d,
  title = "SVM Linear - Decision Boundary"
)
p_rbf <- plot_decision_boundary(
  model = svm_2d_rbf,
  data = train_2d,
  title = "SVM RBF - Decision Boundary"
)
grid.arrange(p_linear, p_rbf, ncol = 2)

# Test a range of C values
c_values <- c(0.01, 0.1, 1, 10, 100)
# One row per C value; both accuracy columns are preallocated and filled below
c_results <- data.frame(
  C = c_values,
  Accuracy = numeric(length(c_values)),
  Training_Accuracy = numeric(length(c_values))
)
# seq_along() stays correct for an empty vector, where 1:length() gives c(1, 0)
for (i in seq_along(c_values)) {
  # RBF model at the current C with gamma held at 0.25.
  # scale = FALSE matches the other fits: the features are already standardised.
  model_c <- svm(Species ~ .,
    data = train_scaled,
    kernel = "radial", cost = c_values[i], gamma = 0.25,
    scale = FALSE
  )
  # Test accuracy
  pred_test <- predict(model_c, test_scaled)
  c_results$Accuracy[i] <- mean(pred_test == test_data$Species)
  # Training accuracy
  pred_train <- predict(model_c, train_scaled)
  c_results$Training_Accuracy[i] <- mean(pred_train == train_data$Species)
}
print(c_results)## C Accuracy Training_Accuracy
## 1 1e-02 0.8000000 0.9000000
## 2 1e-01 0.8666667 0.9000000
## 3 1e+00 0.9333333 0.9833333
## 4 1e+01 0.9666667 0.9833333
## 5 1e+02 0.9333333 0.9916667
# Plot the effect of C
# Long format via pivot_longer() (gather() is superseded): one row per
# (C, Type) pair, where Type names the accuracy column the value came from.
c_long <- c_results %>%
  tidyr::pivot_longer(cols = -C, names_to = "Type", values_to = "Accuracy")
ggplot(c_long, aes(x = log10(C), y = Accuracy, color = Type)) +
  # `linewidth` replaces the deprecated `size` argument for line geoms
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  labs(
    title = "Pengaruh Parameter C terhadap Akurasi",
    x = "log10(C)",
    y = "Akurasi",
    color = "Dataset"
  ) +
  theme_minimal() +
  # Ticks sit at the log10 positions but are labelled with the raw C values
  scale_x_continuous(breaks = log10(c_values), labels = c_values)

# Test a range of gamma values
gamma_values <- c(0.001, 0.01, 0.1, 0.25, 0.5, 1, 2)
# One row per gamma value; both accuracy columns are preallocated and filled below
gamma_results <- data.frame(
  Gamma = gamma_values,
  Accuracy = numeric(length(gamma_values)),
  Training_Accuracy = numeric(length(gamma_values))
)
# seq_along() stays correct for an empty vector, where 1:length() gives c(1, 0)
for (i in seq_along(gamma_values)) {
  # RBF model at the current gamma with cost held at 1.
  # scale = FALSE matches the other fits: the features are already standardised.
  model_gamma <- svm(Species ~ .,
    data = train_scaled,
    kernel = "radial", cost = 1, gamma = gamma_values[i],
    scale = FALSE
  )
  # Test accuracy
  pred_test <- predict(model_gamma, test_scaled)
  gamma_results$Accuracy[i] <- mean(pred_test == test_data$Species)
  # Training accuracy
  pred_train <- predict(model_gamma, train_scaled)
  gamma_results$Training_Accuracy[i] <- mean(pred_train == train_data$Species)
}
print(gamma_results)## Gamma Accuracy Training_Accuracy
## 1 0.001 0.8000000 0.8666667
## 2 0.010 0.8333333 0.9000000
## 3 0.100 0.9333333 0.9750000
## 4 0.250 0.9333333 0.9833333
## 5 0.500 0.9333333 0.9916667
## 6 1.000 0.9333333 0.9916667
## 7 2.000 0.9000000 0.9916667
# Plot the effect of gamma
# Long format via pivot_longer() (gather() is superseded): one row per
# (Gamma, Type) pair, where Type names the accuracy column the value came from.
gamma_long <- gamma_results %>%
  tidyr::pivot_longer(cols = -Gamma, names_to = "Type", values_to = "Accuracy")
ggplot(gamma_long, aes(x = log10(Gamma), y = Accuracy, color = Type)) +
  # `linewidth` replaces the deprecated `size` argument for line geoms
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  labs(
    title = "Pengaruh Parameter Gamma terhadap Akurasi",
    x = "log10(Gamma)",
    y = "Akurasi",
    color = "Dataset"
  ) +
  theme_minimal() +
  # Ticks sit at the log10 positions but are labelled with the raw gamma values
  scale_x_continuous(breaks = log10(gamma_values), labels = gamma_values)

# Build the comparison data frame
# One row per model: overall accuracy plus the unweighted mean of the
# per-class precision, recall and F1 (NA classes are ignored).
comparison <- data.frame(
  Model = c("SVM Linear", "SVM RBF", "SVM RBF (Tuned)"),
  Accuracy = c(linear_accuracy, rbf_accuracy, cm_best$overall["Accuracy"]),
  Precision_Avg = c(
    mean(linear_precision, na.rm = TRUE),
    mean(rbf_precision, na.rm = TRUE),
    mean(cm_best$byClass[, "Precision"], na.rm = TRUE)
  ),
  Recall_Avg = c(
    mean(linear_recall, na.rm = TRUE),
    mean(rbf_recall, na.rm = TRUE),
    mean(cm_best$byClass[, "Recall"], na.rm = TRUE)
  ),
  F1_Avg = c(
    mean(linear_f1, na.rm = TRUE),
    mean(rbf_f1, na.rm = TRUE),
    mean(cm_best$byClass[, "F1"], na.rm = TRUE)
  )
)
# Round the four metric columns to four decimals
comparison[, 2:5] <- round(comparison[, 2:5], digits = 4)
# Display table
kable(comparison, caption = "Perbandingan Performa Model SVM")

| Model | Accuracy | Precision_Avg | Recall_Avg | F1_Avg |
|---|---|---|---|---|
| SVM Linear | 0.9667 | 0.9697 | 0.9667 | 0.9666 |
| SVM RBF | 0.9333 | 0.9444 | 0.9333 | 0.9327 |
| SVM RBF (Tuned) | 0.9333 | 0.9333 | 0.9333 | 0.9333 |
# Comparison chart
# Long format via pivot_longer() (gather() is superseded): one row per
# (Model, Metric) pair. F1 is dropped to keep the chart uncluttered.
comp_long <- comparison %>%
  tidyr::pivot_longer(cols = -Model, names_to = "Metric", values_to = "Value") %>%
  filter(Metric != "F1_Avg")
ggplot(comp_long, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
labs(title = "Perbandingan Performa Model SVM",
x = "Model",
y = "Nilai Metrik",
fill = "Metrik") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ylim(0, 1)

Dataset Iris memiliki karakteristik:

- Fitur Petal Length dan Petal Width memberikan separasi kelas yang sangat baik
- Species Setosa mudah dipisahkan dari yang lain
- Versicolor dan Virginica memiliki beberapa overlap
Catatan: Analisis ini mendemonstrasikan implementasi SVM untuk klasifikasi dengan berbagai kernel dan parameter. Hasil menunjukkan bahwa SVM adalah algoritma yang powerful untuk masalah klasifikasi dengan proper tuning dan preprocessing.
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_Indonesia.utf8 LC_CTYPE=English_Indonesia.utf8
## [3] LC_MONETARY=English_Indonesia.utf8 LC_NUMERIC=C
## [5] LC_TIME=English_Indonesia.utf8
##
## time zone: Asia/Jakarta
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] corrplot_0.94 plotly_4.10.4 knitr_1.50 RColorBrewer_1.1-3
## [5] gridExtra_2.3 dplyr_1.1.4 caret_6.0-94 lattice_0.22-6
## [9] ggplot2_3.5.2 e1071_1.7-14
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.2 timeDate_4032.109
## [4] farver_2.1.2 fastmap_1.2.0 lazyeval_0.2.2
## [7] pROC_1.18.5 digest_0.6.37 rpart_4.1.23
## [10] timechange_0.3.0 lifecycle_1.0.4 survival_3.6-4
## [13] magrittr_2.0.3 compiler_4.4.1 rlang_1.1.4
## [16] sass_0.4.9 tools_4.4.1 utf8_1.2.4
## [19] yaml_2.3.10 data.table_1.16.0 labeling_0.4.3
## [22] htmlwidgets_1.6.4 plyr_1.8.9 withr_3.0.1
## [25] purrr_1.0.2 nnet_7.3-19 grid_4.4.1
## [28] stats4_4.4.1 fansi_1.0.6 colorspace_2.1-1
## [31] future_1.34.0 globals_0.16.3 scales_1.3.0
## [34] iterators_1.0.14 MASS_7.3-60.2 isoband_0.2.7
## [37] cli_3.6.3 rmarkdown_2.28 generics_0.1.3
## [40] rstudioapi_0.16.0 future.apply_1.11.2 httr_1.4.7
## [43] reshape2_1.4.4 cachem_1.1.0 proxy_0.4-27
## [46] stringr_1.5.1 splines_4.4.1 parallel_4.4.1
## [49] vctrs_0.6.5 hardhat_1.4.0 Matrix_1.7-0
## [52] jsonlite_1.8.8 listenv_0.9.1 foreach_1.5.2
## [55] gower_1.0.1 jquerylib_0.1.4 tidyr_1.3.1
## [58] recipes_1.1.0 glue_1.7.0 parallelly_1.38.0
## [61] codetools_0.2-20 lubridate_1.9.3 stringi_1.8.4
## [64] gtable_0.3.5 munsell_0.5.1 tibble_3.2.1
## [67] pillar_1.9.0 htmltools_0.5.8.1 ipred_0.9-15
## [70] lava_1.8.0 R6_2.5.1 evaluate_1.0.0
## [73] bslib_0.8.0 class_7.3-22 Rcpp_1.0.13
## [76] nlme_3.1-164 prodlim_2024.06.25 xfun_0.52
## [79] pkgconfig_2.0.3 ModelMetrics_1.2.2.2