Support Vector Machine (SVM) adalah salah satu algoritma machine learning yang paling powerful untuk klasifikasi dan regresi. Pada tugas ini, kami akan menggunakan dataset penguin untuk membandingkan performa SVM linear dan nonlinear dalam mengklasifikasikan spesies penguin berdasarkan karakteristik fisik mereka.
# Load required libraries
library(palmerpenguins)
library(tidyverse)
library(e1071)
library(caret)
library(gridExtra)
library(viridis)
library(corrplot)
library(plotly)
library(knitr)
library(kableExtra)
# Set seed for reproducibility
set.seed(123)# Load penguin dataset
data(penguins)
# Display basic information about the dataset
cat("Dataset Dimensions:", dim(penguins), "\n")## Dataset Dimensions: 344 8
## Number of missing values: 19
# Show first few rows
head(penguins) %>%
kable(caption = "Dataset Penguin - 6 Baris Pertama") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year |
|---|---|---|---|---|---|---|---|
| Adelie | Torgersen | 39.1 | 18.7 | 181 | 3750 | male | 2007 |
| Adelie | Torgersen | 39.5 | 17.4 | 186 | 3800 | female | 2007 |
| Adelie | Torgersen | 40.3 | 18.0 | 195 | 3250 | female | 2007 |
| Adelie | Torgersen | NA | NA | NA | NA | NA | 2007 |
| Adelie | Torgersen | 36.7 | 19.3 | 193 | 3450 | female | 2007 |
| Adelie | Torgersen | 39.3 | 20.6 | 190 | 3650 | male | 2007 |
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
## $ species : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ island : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ bill_length_mm : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ bill_depth_mm : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
## $ sex : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
## $ year : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
# Summary statistics
summary(penguins) %>%
kable(caption = "Statistik Deskriptif Dataset") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex | year | |
|---|---|---|---|---|---|---|---|---|
| Adelie :152 | Biscoe :168 | Min. :32.10 | Min. :13.10 | Min. :172.0 | Min. :2700 | female:165 | Min. :2007 | |
| Chinstrap: 68 | Dream :124 | 1st Qu.:39.23 | 1st Qu.:15.60 | 1st Qu.:190.0 | 1st Qu.:3550 | male :168 | 1st Qu.:2007 | |
| Gentoo :124 | Torgersen: 52 | Median :44.45 | Median :17.30 | Median :197.0 | Median :4050 | NA’s : 11 | Median :2008 | |
| NA | NA | Mean :43.92 | Mean :17.15 | Mean :200.9 | Mean :4202 | NA | Mean :2008 | |
| NA | NA | 3rd Qu.:48.50 | 3rd Qu.:18.70 | 3rd Qu.:213.0 | 3rd Qu.:4750 | NA | 3rd Qu.:2009 | |
| NA | NA | Max. :59.60 | Max. :21.50 | Max. :231.0 | Max. :6300 | NA | Max. :2009 | |
| NA | NA | NA’s :2 | NA’s :2 | NA’s :2 | NA’s :2 | NA | NA |
# Check missing values pattern
missing_pattern <- penguins %>%
summarise_all(~sum(is.na(.))) %>%
gather(key = "Variable", value = "Missing_Count") %>%
mutate(Missing_Percentage = round(Missing_Count/nrow(penguins)*100, 2))
missing_pattern %>%
kable(caption = "Pola Missing Values") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Variable | Missing_Count | Missing_Percentage |
|---|---|---|
| species | 0 | 0.00 |
| island | 0 | 0.00 |
| bill_length_mm | 2 | 0.58 |
| bill_depth_mm | 2 | 0.58 |
| flipper_length_mm | 2 | 0.58 |
| body_mass_g | 2 | 0.58 |
| sex | 11 | 3.20 |
| year | 0 | 0.00 |
# Remove rows with missing values
penguins_clean <- penguins %>%
drop_na()
cat("Data setelah pembersihan:", nrow(penguins_clean), "observasi")## Data setelah pembersihan: 333 observasi
# Species distribution
species_count <- penguins_clean %>%
count(species) %>%
mutate(percentage = round(n/sum(n)*100, 1))
species_count %>%
kable(caption = "Distribusi Spesies",
col.names = c("Species", "Count", "Percentage (%)")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Species | Count | Percentage (%) |
|---|---|---|
| Adelie | 146 | 43.8 |
| Chinstrap | 68 | 20.4 |
| Gentoo | 119 | 35.7 |
# Visualization
p1 <- ggplot(penguins_clean, aes(x = species, fill = species)) +
geom_bar(alpha = 0.8) +
scale_fill_viridis_d() +
theme_minimal() +
labs(title = "Distribusi Spesies Penguin",
x = "Species", y = "Count") +
theme(legend.position = "none")
print(p1)# Select numeric variables
numeric_vars <- penguins_clean %>%
select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)
# Correlation matrix
cor_matrix <- cor(numeric_vars)
# Visualization
corrplot(cor_matrix, method = "color", type = "upper",
order = "hclust", tl.cex = 0.8, tl.col = "black",
title = "Matriks Korelasi Variabel Numerik",
mar = c(0,0,1,0))# Create distribution plots for all numeric variables
p_bill_length <- ggplot(penguins_clean, aes(x = bill_length_mm, fill = species)) +
geom_density(alpha = 0.7) +
scale_fill_viridis_d() +
theme_minimal() +
labs(title = "Bill Length Distribution")
p_bill_depth <- ggplot(penguins_clean, aes(x = bill_depth_mm, fill = species)) +
geom_density(alpha = 0.7) +
scale_fill_viridis_d() +
theme_minimal() +
labs(title = "Bill Depth Distribution")
p_flipper <- ggplot(penguins_clean, aes(x = flipper_length_mm, fill = species)) +
geom_density(alpha = 0.7) +
scale_fill_viridis_d() +
theme_minimal() +
labs(title = "Flipper Length Distribution")
p_mass <- ggplot(penguins_clean, aes(x = body_mass_g, fill = species)) +
geom_density(alpha = 0.7) +
scale_fill_viridis_d() +
theme_minimal() +
labs(title = "Body Mass Distribution")
# Combine plots
grid.arrange(p_bill_length, p_bill_depth, p_flipper, p_mass, ncol = 2)# Create scatter plot for key variables
p_scatter <- ggplot(penguins_clean, aes(x = bill_length_mm, y = bill_depth_mm,
color = species, size = body_mass_g)) +
geom_point(alpha = 0.7) +
scale_color_viridis_d() +
theme_minimal() +
labs(title = "Relationship between Bill Dimensions",
x = "Bill Length (mm)", y = "Bill Depth (mm)",
size = "Body Mass (g)", color = "Species")
print(p_scatter)# Prepare data for modeling
# Select relevant features and target
modeling_data <- penguins_clean %>%
select(species, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)
# Check for any remaining issues
summary(modeling_data)## species bill_length_mm bill_depth_mm flipper_length_mm
## Adelie :146 Min. :32.10 Min. :13.10 Min. :172
## Chinstrap: 68 1st Qu.:39.50 1st Qu.:15.60 1st Qu.:190
## Gentoo :119 Median :44.50 Median :17.30 Median :197
## Mean :43.99 Mean :17.16 Mean :201
## 3rd Qu.:48.60 3rd Qu.:18.70 3rd Qu.:213
## Max. :59.60 Max. :21.50 Max. :231
## body_mass_g
## Min. :2700
## 1st Qu.:3550
## Median :4050
## Mean :4207
## 3rd Qu.:4775
## Max. :6300
# Create train-test split
train_index <- createDataPartition(modeling_data$species, p = 0.8, list = FALSE)
train_data <- modeling_data[train_index, ]
test_data <- modeling_data[-train_index, ]
cat("Training set size:", nrow(train_data), "\n")## Training set size: 268
## Test set size: 65
# Display class distribution in train and test sets
train_dist <- table(train_data$species)
test_dist <- table(test_data$species)
distribution_table <- data.frame(
Species = names(train_dist),
Train = as.numeric(train_dist),
Test = as.numeric(test_dist),
Train_Prop = round(prop.table(train_dist), 3),
Test_Prop = round(prop.table(test_dist), 3)
)
distribution_table %>%
kable(caption = "Distribusi Class pada Train dan Test Set") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Species | Train | Test | Train_Prop.Var1 | Train_Prop.Freq | Test_Prop.Var1 | Test_Prop.Freq |
|---|---|---|---|---|---|---|
| Adelie | 117 | 29 | Adelie | 0.437 | Adelie | 0.446 |
| Chinstrap | 55 | 13 | Chinstrap | 0.205 | Chinstrap | 0.200 |
| Gentoo | 96 | 23 | Gentoo | 0.358 | Gentoo | 0.354 |
# Train linear SVM
svm_linear <- svm(species ~ .,
data = train_data,
kernel = "linear",
cost = 1,
scale = TRUE)
# Model summary
summary(svm_linear)##
## Call:
## svm(formula = species ~ ., data = train_data, kernel = "linear",
## cost = 1, scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 20
##
## ( 9 3 8 )
##
##
## Number of Classes: 3
##
## Levels:
## Adelie Chinstrap Gentoo
# Predictions
pred_linear_train <- predict(svm_linear, train_data)
pred_linear_test <- predict(svm_linear, test_data)
# Confusion matrices
cm_linear_train <- confusionMatrix(pred_linear_train, train_data$species)
cm_linear_test <- confusionMatrix(pred_linear_test, test_data$species)
# Display results
cat("=== LINEAR SVM PERFORMANCE ===\n")## === LINEAR SVM PERFORMANCE ===
## Training Accuracy: 0.9925
## Test Accuracy: 0.9692
## [1] "Test Set Detailed Metrics:"
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 29 2 0
## Chinstrap 0 11 0
## Gentoo 0 0 23
##
## Overall Statistics
##
## Accuracy : 0.9692
## 95% CI : (0.8932, 0.9963)
## No Information Rate : 0.4462
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.951
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 1.0000 0.8462 1.0000
## Specificity 0.9444 1.0000 1.0000
## Pos Pred Value 0.9355 1.0000 1.0000
## Neg Pred Value 1.0000 0.9630 1.0000
## Prevalence 0.4462 0.2000 0.3538
## Detection Rate 0.4462 0.1692 0.3538
## Detection Prevalence 0.4769 0.1692 0.3538
## Balanced Accuracy 0.9722 0.9231 1.0000
# Hyperparameter tuning for linear SVM
tune_linear <- tune(svm, species ~ .,
data = train_data,
kernel = "linear",
ranges = list(cost = c(0.01, 0.1, 1, 10, 100)),
scale = TRUE)
# Best parameters
cat("Best Cost parameter for Linear SVM:", tune_linear$best.parameters$cost, "\n")## Best Cost parameter for Linear SVM: 10
## Best CV Accuracy: 1
# Train final linear SVM with best parameters
svm_linear_final <- svm(species ~ .,
data = train_data,
kernel = "linear",
cost = tune_linear$best.parameters$cost,
scale = TRUE)
# Final predictions
pred_linear_final <- predict(svm_linear_final, test_data)
cm_linear_final <- confusionMatrix(pred_linear_final, test_data$species)
cat("=== FINAL LINEAR SVM PERFORMANCE ===\n")## === FINAL LINEAR SVM PERFORMANCE ===
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 28 2 0
## Chinstrap 1 11 0
## Gentoo 0 0 23
##
## Overall Statistics
##
## Accuracy : 0.9538
## 95% CI : (0.871, 0.9904)
## No Information Rate : 0.4462
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.927
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.9655 0.8462 1.0000
## Specificity 0.9444 0.9808 1.0000
## Pos Pred Value 0.9333 0.9167 1.0000
## Neg Pred Value 0.9714 0.9623 1.0000
## Prevalence 0.4462 0.2000 0.3538
## Detection Rate 0.4308 0.1692 0.3538
## Detection Prevalence 0.4615 0.1846 0.3538
## Balanced Accuracy 0.9550 0.9135 1.0000
# Train RBF SVM
svm_rbf <- svm(species ~ .,
data = train_data,
kernel = "radial",
cost = 1,
gamma = 1,
scale = TRUE)
# Model summary
summary(svm_rbf)##
## Call:
## svm(formula = species ~ ., data = train_data, kernel = "radial",
## cost = 1, gamma = 1, scale = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 71
##
## ( 29 22 20 )
##
##
## Number of Classes: 3
##
## Levels:
## Adelie Chinstrap Gentoo
# Hyperparameter tuning for RBF SVM
tune_rbf <- tune(svm, species ~ .,
data = train_data,
kernel = "radial",
ranges = list(cost = c(0.1, 1, 10, 100),
gamma = c(0.01, 0.1, 1, 10)),
scale = TRUE)
# Best parameters
cat("Best parameters for RBF SVM:\n")## Best parameters for RBF SVM:
## Cost: 100
## Gamma: 0.1
## Best CV Accuracy: 1
# Train final RBF SVM with best parameters
svm_rbf_final <- svm(species ~ .,
data = train_data,
kernel = "radial",
cost = tune_rbf$best.parameters$cost,
gamma = tune_rbf$best.parameters$gamma,
scale = TRUE)
# Final predictions
pred_rbf_final <- predict(svm_rbf_final, test_data)
cm_rbf_final <- confusionMatrix(pred_rbf_final, test_data$species)
cat("=== FINAL RBF SVM PERFORMANCE ===\n")## === FINAL RBF SVM PERFORMANCE ===
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 28 1 0
## Chinstrap 1 12 0
## Gentoo 0 0 23
##
## Overall Statistics
##
## Accuracy : 0.9692
## 95% CI : (0.8932, 0.9963)
## No Information Rate : 0.4462
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9516
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.9655 0.9231 1.0000
## Specificity 0.9722 0.9808 1.0000
## Pos Pred Value 0.9655 0.9231 1.0000
## Neg Pred Value 0.9722 0.9808 1.0000
## Prevalence 0.4462 0.2000 0.3538
## Detection Rate 0.4308 0.1846 0.3538
## Detection Prevalence 0.4462 0.2000 0.3538
## Balanced Accuracy 0.9689 0.9519 1.0000
# Create comparison table
comparison_data <- data.frame(
Model = c("Linear SVM", "RBF SVM"),
Accuracy = c(cm_linear_final$overall['Accuracy'],
cm_rbf_final$overall['Accuracy']),
Precision = c(mean(cm_linear_final$byClass[,'Precision'], na.rm = TRUE),
mean(cm_rbf_final$byClass[,'Precision'], na.rm = TRUE)),
Recall = c(mean(cm_linear_final$byClass[,'Recall'], na.rm = TRUE),
mean(cm_rbf_final$byClass[,'Recall'], na.rm = TRUE)),
F1_Score = c(mean(cm_linear_final$byClass[,'F1'], na.rm = TRUE),
mean(cm_rbf_final$byClass[,'F1'], na.rm = TRUE))
)
comparison_data %>%
mutate_if(is.numeric, round, 4) %>%
kable(caption = "Perbandingan Performa Model SVM") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| Model | Accuracy | Precision | Recall | F1_Score |
|---|---|---|---|---|
| Linear SVM | 0.9538 | 0.9500 | 0.9372 | 0.9431 |
| RBF SVM | 0.9692 | 0.9629 | 0.9629 | 0.9629 |
# Linear SVM class-wise metrics
linear_metrics <- data.frame(
Species = rownames(cm_linear_final$byClass),
Model = "Linear SVM",
Precision = cm_linear_final$byClass[,'Precision'],
Recall = cm_linear_final$byClass[,'Recall'],
F1_Score = cm_linear_final$byClass[,'F1']
)
# RBF SVM class-wise metrics
rbf_metrics <- data.frame(
Species = rownames(cm_rbf_final$byClass),
Model = "RBF SVM",
Precision = cm_rbf_final$byClass[,'Precision'],
Recall = cm_rbf_final$byClass[,'Recall'],
F1_Score = cm_rbf_final$byClass[,'F1']
)
# Combine and display
class_comparison <- rbind(linear_metrics, rbf_metrics)
class_comparison %>%
mutate_if(is.numeric, round, 4) %>%
kable(caption = "Performa Per-Class untuk Setiap Model") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))| Species | Model | Precision | Recall | F1_Score | |
|---|---|---|---|---|---|
| Class: Adelie | Class: Adelie | Linear SVM | 0.9333 | 0.9655 | 0.9492 |
| Class: Chinstrap | Class: Chinstrap | Linear SVM | 0.9167 | 0.8462 | 0.8800 |
| Class: Gentoo | Class: Gentoo | Linear SVM | 1.0000 | 1.0000 | 1.0000 |
| Class: Adelie1 | Class: Adelie | RBF SVM | 0.9655 | 0.9655 | 0.9655 |
| Class: Chinstrap1 | Class: Chinstrap | RBF SVM | 0.9231 | 0.9231 | 0.9231 |
| Class: Gentoo1 | Class: Gentoo | RBF SVM | 1.0000 | 1.0000 | 1.0000 |
# Since we have 4 features, we'll use PCA for 2D visualization
pca_result <- prcomp(train_data[, -1], scale = TRUE)
# Create 2D data
train_2d <- data.frame(
PC1 = pca_result$x[,1],
PC2 = pca_result$x[,2],
species = train_data$species
)
test_2d <- data.frame(
PC1 = predict(pca_result, test_data[, -1])[,1],
PC2 = predict(pca_result, test_data[, -1])[,2],
species = test_data$species
)
# Variance explained
var_explained <- summary(pca_result)$importance[2,]
cat("PC1 explains", round(var_explained[1]*100, 2), "% of variance\n")## PC1 explains 68.37 % of variance
## PC2 explains 19.61 % of variance
## Total variance explained by PC1+PC2: 87.98 %
# Train SVM models on 2D data
svm_2d_linear <- svm(species ~ ., data = train_2d, kernel = "linear", scale = TRUE)
svm_2d_rbf <- svm(species ~ ., data = train_2d, kernel = "radial", scale = TRUE)
# Create prediction grid
grid_resolution <- 100
pc1_range <- range(train_2d$PC1)
pc2_range <- range(train_2d$PC2)
pc1_seq <- seq(pc1_range[1] - 1, pc1_range[2] + 1, length.out = grid_resolution)
pc2_seq <- seq(pc2_range[1] - 1, pc2_range[2] + 1, length.out = grid_resolution)
grid <- expand.grid(PC1 = pc1_seq, PC2 = pc2_seq)
# Predictions on grid
grid$linear_pred <- predict(svm_2d_linear, grid)
grid$rbf_pred <- predict(svm_2d_rbf, grid)
# Plot Linear SVM Decision Boundary
p_linear_boundary <- ggplot() +
geom_point(data = grid, aes(x = PC1, y = PC2, color = linear_pred),
alpha = 0.3, size = 0.5) +
geom_point(data = train_2d, aes(x = PC1, y = PC2, color = species),
size = 2, alpha = 0.8) +
scale_color_viridis_d() +
theme_minimal() +
labs(title = "Linear SVM Decision Boundary (2D PCA Space)",
x = paste("PC1 (", round(var_explained[1]*100, 1), "% variance)"),
y = paste("PC2 (", round(var_explained[2]*100, 1), "% variance)"),
color = "Species") +
theme(legend.position = "bottom")
# Plot RBF SVM Decision Boundary
p_rbf_boundary <- ggplot() +
geom_point(data = grid, aes(x = PC1, y = PC2, color = rbf_pred),
alpha = 0.3, size = 0.5) +
geom_point(data = train_2d, aes(x = PC1, y = PC2, color = species),
size = 2, alpha = 0.8) +
scale_color_viridis_d() +
theme_minimal() +
labs(title = "RBF SVM Decision Boundary (2D PCA Space)",
x = paste("PC1 (", round(var_explained[1]*100, 1), "% variance)"),
y = paste("PC2 (", round(var_explained[2]*100, 1), "% variance)"),
color = "Species") +
theme(legend.position = "bottom")
# Display plots
print(p_linear_boundary)# Test different C values
c_values <- c(0.01, 0.1, 1, 10, 100)
c_results <- data.frame()
for(c_val in c_values) {
svm_temp <- svm(species ~ ., data = train_data, kernel = "linear",
cost = c_val, scale = TRUE)
pred_temp <- predict(svm_temp, test_data)
acc_temp <- confusionMatrix(pred_temp, test_data$species)$overall['Accuracy']
c_results <- rbind(c_results, data.frame(C = c_val, Accuracy = acc_temp))
}
# Plot C parameter effect
p_c_effect <- ggplot(c_results, aes(x = log10(C), y = Accuracy)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 3) +
theme_minimal() +
labs(title = "Pengaruh Parameter C terhadap Akurasi (Linear SVM)",
x = "log10(C)", y = "Test Accuracy") +
scale_x_continuous(breaks = log10(c_values), labels = c_values)
print(p_c_effect)# Display results table
c_results %>%
mutate(Accuracy = round(Accuracy, 4)) %>%
kable(caption = "Pengaruh Parameter C pada Linear SVM") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| C | Accuracy | |
|---|---|---|
| Accuracy | 1e-02 | 0.9385 |
| Accuracy1 | 1e-01 | 0.9385 |
| Accuracy2 | 1e+00 | 0.9692 |
| Accuracy3 | 1e+01 | 0.9538 |
| Accuracy4 | 1e+02 | 0.9692 |
# Test different gamma values (with fixed optimal C)
gamma_values <- c(0.001, 0.01, 0.1, 1, 10)
gamma_results <- data.frame()
for(gamma_val in gamma_values) {
svm_temp <- svm(species ~ ., data = train_data, kernel = "radial",
cost = tune_rbf$best.parameters$cost,
gamma = gamma_val, scale = TRUE)
pred_temp <- predict(svm_temp, test_data)
acc_temp <- confusionMatrix(pred_temp, test_data$species)$overall['Accuracy']
gamma_results <- rbind(gamma_results, data.frame(Gamma = gamma_val, Accuracy = acc_temp))
}
# Plot gamma parameter effect
p_gamma_effect <- ggplot(gamma_results, aes(x = log10(Gamma), y = Accuracy)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 3) +
theme_minimal() +
labs(title = "Pengaruh Parameter Gamma terhadap Akurasi (RBF SVM)",
x = "log10(Gamma)", y = "Test Accuracy") +
scale_x_continuous(breaks = log10(gamma_values), labels = gamma_values)
print(p_gamma_effect)# Display results table
gamma_results %>%
mutate(Accuracy = round(Accuracy, 4)) %>%
kable(caption = "Pengaruh Parameter Gamma pada RBF SVM") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Gamma | Accuracy | |
|---|---|---|
| Accuracy | 1e-03 | 0.9538 |
| Accuracy1 | 1e-02 | 0.9538 |
| Accuracy2 | 1e-01 | 0.9692 |
| Accuracy3 | 1e+00 | 0.9385 |
| Accuracy4 | 1e+01 | 0.8462 |
# For linear SVM, we can examine the coefficients
w <- t(svm_linear_final$coefs) %*% svm_linear_final$SV
# Create feature importance dataframe
feature_names <- colnames(train_data)[-1]
feature_importance <- data.frame(
Feature = feature_names,
Importance = abs(as.numeric(w))
) %>%
arrange(desc(Importance))
feature_importance %>%
mutate(Importance = round(Importance, 4)) %>%
kable(caption = "Feature Importance (Linear SVM)") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Feature | Importance |
|---|---|
| bill_length_mm | 26.0013 |
| bill_depth_mm | 25.2459 |
| flipper_length_mm | 18.3424 |
| body_mass_g | 17.6635 |
| bill_depth_mm | 9.8024 |
| body_mass_g | 6.9998 |
| flipper_length_mm | 4.7038 |
| bill_length_mm | 3.9562 |
# Visualize feature importance
p_importance <- ggplot(feature_importance, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.7) +
coord_flip() +
theme_minimal() +
labs(title = "Feature Importance dalam Linear SVM",
x = "Features", y = "Absolute Coefficient Value")
print(p_importance)# Perform k-fold cross-validation
set.seed(123)
cv_folds <- 10
# Linear SVM CV
cv_linear <- train(species ~ .,
data = train_data,
method = "svmLinear",
trControl = trainControl(method = "cv", number = cv_folds),
tuneGrid = data.frame(C = tune_linear$best.parameters$cost),
preProcess = c("center", "scale"))
# RBF SVM CV
cv_rbf <- train(species ~ .,
data = train_data,
method = "svmRadial",
trControl = trainControl(method = "cv", number = cv_folds),
tuneGrid = data.frame(C = tune_rbf$best.parameters$cost,
sigma = tune_rbf$best.parameters$gamma),
preProcess = c("center", "scale"))
# Results
cv_results <- data.frame(
Model = c("Linear SVM", "RBF SVM"),
CV_Accuracy = c(cv_linear$results$Accuracy, cv_rbf$results$Accuracy),
CV_Std = c(cv_linear$results$AccuracySD, cv_rbf$results$AccuracySD)
)
cv_results %>%
mutate_if(is.numeric, round, 4) %>%
kable(caption = "Cross-Validation Results") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Model | CV_Accuracy | CV_Std |
|---|---|---|
| Linear SVM | 1.0000 | 0.0000 |
| RBF SVM | 0.9926 | 0.0156 |
## === SUPPORT VECTORS ANALYSIS ===
## Linear SVM:
## Number of Support Vectors: 12
## Total Training Samples: 268
## Support Vector Ratio: 0.0448
## RBF SVM:
## Number of Support Vectors: 16
## Total Training Samples: 268
## Support Vector Ratio: 0.0597
# Support vectors by class
sv_by_class_linear <- table(svm_linear_final$fitted)
sv_by_class_rbf <- table(svm_rbf_final$fitted)
sv_comparison <- data.frame(
Species = names(sv_by_class_linear),
Linear_SVM_SV = as.numeric(sv_by_class_linear),
RBF_SVM_SV = as.numeric(sv_by_class_rbf)
)
sv_comparison %>%
kable(caption = "Support Vectors per Species") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Species | Linear_SVM_SV | RBF_SVM_SV |
|---|---|---|
| Adelie | 117 | 117 |
| Chinstrap | 55 | 55 |
| Gentoo | 96 | 96 |
# Analyze misclassifications
test_results <- data.frame(
Actual = test_data$species,
Linear_Pred = pred_linear_final,
RBF_Pred = pred_rbf_final,
Linear_Correct = pred_linear_final == test_data$species,
RBF_Correct = pred_rbf_final == test_data$species
)
# Count misclassifications
linear_errors <- sum(!test_results$Linear_Correct)
rbf_errors <- sum(!test_results$RBF_Correct)
cat("Linear SVM Misclassifications:", linear_errors, "out of", nrow(test_data), "\n")## Linear SVM Misclassifications: 3 out of 65
## RBF SVM Misclassifications: 2 out of 65
# Show misclassified cases
if(linear_errors > 0) {
cat("\n=== LINEAR SVM MISCLASSIFICATIONS ===\n")
misclassified_linear <- test_results[!test_results$Linear_Correct, ]
print(misclassified_linear)
}##
## === LINEAR SVM MISCLASSIFICATIONS ===
## Actual Linear_Pred RBF_Pred Linear_Correct RBF_Correct
## 17 Adelie Chinstrap Chinstrap FALSE FALSE
## 56 Chinstrap Adelie Adelie FALSE FALSE
## 64 Chinstrap Adelie Chinstrap FALSE TRUE
if(rbf_errors > 0) {
cat("\n=== RBF SVM MISCLASSIFICATIONS ===\n")
misclassified_rbf <- test_results[!test_results$RBF_Correct, ]
print(misclassified_rbf)
}##
## === RBF SVM MISCLASSIFICATIONS ===
## Actual Linear_Pred RBF_Pred Linear_Correct RBF_Correct
## 17 Adelie Chinstrap Chinstrap FALSE FALSE
## 56 Chinstrap Adelie Adelie FALSE FALSE
Berdasarkan analisis yang telah dilakukan, berikut adalah ringkasan hasil utama:
Berdasarkan hasil analisis: - RBF SVM menunjukkan performa yang lebih baik - Dataset penguin memiliki beberapa non-linear relationships yang dapat ditangkap lebih baik oleh RBF kernel - Support vector ratio yang lebih tinggi pada RBF menunjukkan efisiensi model
Dari analisis feature importance pada Linear SVM: 1. bill_length_mm adalah fitur paling penting 2. bill_depth_mm menunjukkan kontribusi signifikan kedua 3. Semua fitur berkontribusi dalam klasifikasi, menunjukkan relevansi semua pengukuran morfologi
Visualisasi decision boundary dalam ruang PCA menunjukkan: - Linear SVM: Membentuk garis pemisah linear yang jelas - RBF SVM: Menghasilkan boundary yang lebih complex dan fleksibel - Separabilitas: Dataset penguin relatif well-separated, menjelaskan performa tinggi kedua model
palmerpenguins in R## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.3.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Asia/Jakarta
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.4.0 knitr_1.50 plotly_4.10.4
## [4] corrplot_0.92 viridis_0.6.3 viridisLite_0.4.2
## [7] gridExtra_2.3 caret_6.0-94 lattice_0.21-8
## [10] e1071_1.7-13 lubridate_1.9.2 forcats_1.0.0
## [13] stringr_1.5.0 dplyr_1.1.4 purrr_1.0.1
## [16] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1
## [19] ggplot2_3.5.2 tidyverse_2.0.0 palmerpenguins_0.1.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.0 timeDate_4022.108 farver_2.1.1
## [4] lazyeval_0.2.2 fastmap_1.1.1 pROC_1.18.4
## [7] digest_0.6.31 rpart_4.1.19 timechange_0.2.0
## [10] lifecycle_1.0.3 survival_3.5-5 kernlab_0.9-32
## [13] magrittr_2.0.3 compiler_4.3.0 rlang_1.1.1
## [16] sass_0.4.6 tools_4.3.0 utf8_1.2.3
## [19] yaml_2.3.7 data.table_1.14.8 labeling_0.4.2
## [22] htmlwidgets_1.6.2 xml2_1.3.4 plyr_1.8.8
## [25] withr_2.5.0 nnet_7.3-18 grid_4.3.0
## [28] stats4_4.3.0 fansi_1.0.4 colorspace_2.1-0
## [31] future_1.33.0 globals_0.16.2 scales_1.3.0
## [34] iterators_1.0.14 MASS_7.3-60 cli_3.6.1
## [37] rmarkdown_2.22 generics_0.1.3 rstudioapi_0.14
## [40] future.apply_1.11.0 httr_1.4.6 reshape2_1.4.4
## [43] tzdb_0.4.0 cachem_1.0.8 proxy_0.4-27
## [46] splines_4.3.0 parallel_4.3.0 vctrs_0.6.4
## [49] hardhat_1.3.0 Matrix_1.5-4 jsonlite_1.8.4
## [52] hms_1.1.3 listenv_0.9.0 systemfonts_1.0.4
## [55] foreach_1.5.2 gower_1.0.1 jquerylib_0.1.4
## [58] recipes_1.0.8 glue_1.6.2 parallelly_1.36.0
## [61] codetools_0.2-19 stringi_1.7.12 gtable_0.3.3
## [64] munsell_0.5.0 pillar_1.9.0 htmltools_0.5.6
## [67] ipred_0.9-14 lava_1.7.2.1 R6_2.5.1
## [70] evaluate_0.21 bslib_0.4.2 class_7.3-21
## [73] Rcpp_1.0.10 svglite_2.1.3 nlme_3.1-166
## [76] prodlim_2023.08.28 xfun_0.52 pkgconfig_2.0.3
## [79] ModelMetrics_1.2.2.2
Catatan: Laporan ini dibuat dengan R Markdown untuk memastikan reproducibility dan transparansi dalam analisis. Semua kode dan visualisasi dapat direplikasi dengan menjalankan script yang disediakan.
Laporan Tugas SVM - Klasifikasi Spesies Penguin
Mata Kuliah: Machine Learning
Tanggal: 2025-06-02