1. Introduction

Support Vector Machine (SVM) is one of the most powerful machine learning algorithms for classification and regression. In this assignment, we use the penguin dataset to compare the performance of linear and nonlinear SVMs in classifying penguin species from their physical measurements.

1.1 Research Objectives

  1. Understand and apply the SVM algorithm for classification
  2. Compare the performance of linear vs. nonlinear (RBF-kernel) SVMs
  3. Analyze the effect of the hyperparameters C and gamma
  4. Visualize decision boundaries for better understanding

2. Data and Library Preparation

# Load required libraries
library(palmerpenguins)
library(tidyverse)
library(e1071)
library(caret)
library(gridExtra)
library(viridis)
library(corrplot)
library(plotly)
library(knitr)
library(kableExtra)

# Set seed for reproducibility
set.seed(123)
# Load penguin dataset
data(penguins)

# Display basic information about the dataset
cat("Dataset Dimensions:", dim(penguins), "\n")
## Dataset Dimensions: 344 8
cat("Number of missing values:", sum(is.na(penguins)), "\n")
## Number of missing values: 19
# Show first few rows
head(penguins) %>%
  kable(caption = "Dataset Penguin - 6 Baris Pertama") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Penguin Dataset - First 6 Rows
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex year
Adelie Torgersen 39.1 18.7 181 3750 male 2007
Adelie Torgersen 39.5 17.4 186 3800 female 2007
Adelie Torgersen 40.3 18.0 195 3250 female 2007
Adelie Torgersen NA NA NA NA NA 2007
Adelie Torgersen 36.7 19.3 193 3450 female 2007
Adelie Torgersen 39.3 20.6 190 3650 male 2007

3. Initial Data Exploration

3.1 Dataset Structure

# Dataset structure
str(penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
##  $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
##  $ year             : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
# Summary statistics
summary(penguins) %>%
  kable(caption = "Statistik Deskriptif Dataset") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Descriptive Statistics of the Dataset
species             Adelie: 152, Chinstrap: 68, Gentoo: 124
island              Biscoe: 168, Dream: 124, Torgersen: 52
bill_length_mm      Min: 32.10, 1st Qu.: 39.23, Median: 44.45, Mean: 43.92, 3rd Qu.: 48.50, Max: 59.60, NA's: 2
bill_depth_mm       Min: 13.10, 1st Qu.: 15.60, Median: 17.30, Mean: 17.15, 3rd Qu.: 18.70, Max: 21.50, NA's: 2
flipper_length_mm   Min: 172.0, 1st Qu.: 190.0, Median: 197.0, Mean: 200.9, 3rd Qu.: 213.0, Max: 231.0, NA's: 2
body_mass_g         Min: 2700, 1st Qu.: 3550, Median: 4050, Mean: 4202, 3rd Qu.: 4750, Max: 6300, NA's: 2
sex                 female: 165, male: 168, NA's: 11
year                Min: 2007, 1st Qu.: 2007, Median: 2008, Mean: 2008, 3rd Qu.: 2009, Max: 2009

3.2 Handling Missing Values

# Check missing values pattern
missing_pattern <- penguins %>%
  summarise_all(~sum(is.na(.))) %>%
  gather(key = "Variable", value = "Missing_Count") %>%
  mutate(Missing_Percentage = round(Missing_Count/nrow(penguins)*100, 2))

missing_pattern %>%
  kable(caption = "Pola Missing Values") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Missing Value Patterns
Variable Missing_Count Missing_Percentage
species 0 0.00
island 0 0.00
bill_length_mm 2 0.58
bill_depth_mm 2 0.58
flipper_length_mm 2 0.58
body_mass_g 2 0.58
sex 11 3.20
year 0 0.00
# Remove rows with missing values
penguins_clean <- penguins %>%
  drop_na()

cat("Data setelah pembersihan:", nrow(penguins_clean), "observasi")
## Data setelah pembersihan: 333 observasi

3.3 Species Distribution

# Species distribution
species_count <- penguins_clean %>%
  count(species) %>%
  mutate(percentage = round(n/sum(n)*100, 1))

species_count %>%
  kable(caption = "Distribusi Spesies", 
        col.names = c("Species", "Count", "Percentage (%)")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Species Distribution
Species Count Percentage (%)
Adelie 146 43.8
Chinstrap 68 20.4
Gentoo 119 35.7
# Visualization
p1 <- ggplot(penguins_clean, aes(x = species, fill = species)) +
  geom_bar(alpha = 0.8) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(title = "Distribusi Spesies Penguin",
       x = "Species", y = "Count") +
  theme(legend.position = "none")

print(p1)

3.4 Correlation Analysis of Numeric Variables

# Select numeric variables
numeric_vars <- penguins_clean %>%
  select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)

# Correlation matrix
cor_matrix <- cor(numeric_vars)

# Visualization
corrplot(cor_matrix, method = "color", type = "upper", 
         order = "hclust", tl.cex = 0.8, tl.col = "black",
         title = "Matriks Korelasi Variabel Numerik",
         mar = c(0,0,1,0))

3.5 Visualizing Variable Distributions

# Create distribution plots for all numeric variables
p_bill_length <- ggplot(penguins_clean, aes(x = bill_length_mm, fill = species)) +
  geom_density(alpha = 0.7) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(title = "Bill Length Distribution")

p_bill_depth <- ggplot(penguins_clean, aes(x = bill_depth_mm, fill = species)) +
  geom_density(alpha = 0.7) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(title = "Bill Depth Distribution")

p_flipper <- ggplot(penguins_clean, aes(x = flipper_length_mm, fill = species)) +
  geom_density(alpha = 0.7) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(title = "Flipper Length Distribution")

p_mass <- ggplot(penguins_clean, aes(x = body_mass_g, fill = species)) +
  geom_density(alpha = 0.7) +
  scale_fill_viridis_d() +
  theme_minimal() +
  labs(title = "Body Mass Distribution")

# Combine plots
grid.arrange(p_bill_length, p_bill_depth, p_flipper, p_mass, ncol = 2)

3.6 Scatter Plot of Key Variables

# Create scatter plot for key variables
p_scatter <- ggplot(penguins_clean, aes(x = bill_length_mm, y = bill_depth_mm, 
                                       color = species, size = body_mass_g)) +
  geom_point(alpha = 0.7) +
  scale_color_viridis_d() +
  theme_minimal() +
  labs(title = "Relationship between Bill Dimensions",
       x = "Bill Length (mm)", y = "Bill Depth (mm)",
       size = "Body Mass (g)", color = "Species")

print(p_scatter)

4. Data Preparation for Modeling

# Prepare data for modeling
# Select relevant features and target
modeling_data <- penguins_clean %>%
  select(species, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)

# Check for any remaining issues
summary(modeling_data)
##       species    bill_length_mm  bill_depth_mm   flipper_length_mm
##  Adelie   :146   Min.   :32.10   Min.   :13.10   Min.   :172      
##  Chinstrap: 68   1st Qu.:39.50   1st Qu.:15.60   1st Qu.:190      
##  Gentoo   :119   Median :44.50   Median :17.30   Median :197      
##                  Mean   :43.99   Mean   :17.16   Mean   :201      
##                  3rd Qu.:48.60   3rd Qu.:18.70   3rd Qu.:213      
##                  Max.   :59.60   Max.   :21.50   Max.   :231      
##   body_mass_g  
##  Min.   :2700  
##  1st Qu.:3550  
##  Median :4050  
##  Mean   :4207  
##  3rd Qu.:4775  
##  Max.   :6300
# Create train-test split
train_index <- createDataPartition(modeling_data$species, p = 0.8, list = FALSE)
train_data <- modeling_data[train_index, ]
test_data <- modeling_data[-train_index, ]

cat("Training set size:", nrow(train_data), "\n")
## Training set size: 268
cat("Test set size:", nrow(test_data), "\n")
## Test set size: 65
# Display class distribution in train and test sets
train_dist <- table(train_data$species)
test_dist <- table(test_data$species)

# as.numeric() strips the table class so prop.table() does not expand
# into extra Var1/Freq columns in the data frame
distribution_table <- data.frame(
  Species = names(train_dist),
  Train = as.numeric(train_dist),
  Test = as.numeric(test_dist),
  Train_Prop = as.numeric(round(prop.table(train_dist), 3)),
  Test_Prop = as.numeric(round(prop.table(test_dist), 3))
)

distribution_table %>%
  kable(caption = "Distribusi Class pada Train dan Test Set") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Class Distribution in the Train and Test Sets
Species Train Test Train_Prop Test_Prop
Adelie 117 29 0.437 0.446
Chinstrap 55 13 0.205 0.200
Gentoo 96 23 0.358 0.354

5. Linear SVM Model

5.1 Training the Linear SVM

# Train linear SVM
svm_linear <- svm(species ~ ., 
                  data = train_data, 
                  kernel = "linear", 
                  cost = 1,
                  scale = TRUE)

# Model summary
summary(svm_linear)
## 
## Call:
## svm(formula = species ~ ., data = train_data, kernel = "linear", 
##     cost = 1, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  20
## 
##  ( 9 3 8 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  Adelie Chinstrap Gentoo

5.2 Linear SVM Prediction and Evaluation

# Predictions
pred_linear_train <- predict(svm_linear, train_data)
pred_linear_test <- predict(svm_linear, test_data)

# Confusion matrices
cm_linear_train <- confusionMatrix(pred_linear_train, train_data$species)
cm_linear_test <- confusionMatrix(pred_linear_test, test_data$species)

# Display results
cat("=== LINEAR SVM PERFORMANCE ===\n")
## === LINEAR SVM PERFORMANCE ===
cat("Training Accuracy:", round(cm_linear_train$overall['Accuracy'], 4), "\n")
## Training Accuracy: 0.9925
cat("Test Accuracy:", round(cm_linear_test$overall['Accuracy'], 4), "\n\n")
## Test Accuracy: 0.9692
# Detailed metrics for test set
print("Test Set Detailed Metrics:")
## [1] "Test Set Detailed Metrics:"
print(cm_linear_test)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        29         2      0
##   Chinstrap      0        11      0
##   Gentoo         0         0     23
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9692          
##                  95% CI : (0.8932, 0.9963)
##     No Information Rate : 0.4462          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.951           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 1.0000           0.8462        1.0000
## Specificity                 0.9444           1.0000        1.0000
## Pos Pred Value              0.9355           1.0000        1.0000
## Neg Pred Value              1.0000           0.9630        1.0000
## Prevalence                  0.4462           0.2000        0.3538
## Detection Rate              0.4462           0.1692        0.3538
## Detection Prevalence        0.4769           0.1692        0.3538
## Balanced Accuracy           0.9722           0.9231        1.0000

5.3 Hyperparameter Tuning for the Linear SVM

# Hyperparameter tuning for linear SVM
tune_linear <- tune(svm, species ~ ., 
                    data = train_data,
                    kernel = "linear",
                    ranges = list(cost = c(0.01, 0.1, 1, 10, 100)),
                    scale = TRUE)

# Best parameters
cat("Best Cost parameter for Linear SVM:", tune_linear$best.parameters$cost, "\n")
## Best Cost parameter for Linear SVM: 10
cat("Best CV Accuracy:", round(1 - tune_linear$best.performance, 4), "\n")
## Best CV Accuracy: 1
# Plot tuning results
plot(tune_linear, main = "Linear SVM Hyperparameter Tuning")

5.4 Final Linear SVM Model

# Train final linear SVM with best parameters
svm_linear_final <- svm(species ~ ., 
                        data = train_data, 
                        kernel = "linear", 
                        cost = tune_linear$best.parameters$cost,
                        scale = TRUE)

# Final predictions
pred_linear_final <- predict(svm_linear_final, test_data)
cm_linear_final <- confusionMatrix(pred_linear_final, test_data$species)

cat("=== FINAL LINEAR SVM PERFORMANCE ===\n")
## === FINAL LINEAR SVM PERFORMANCE ===
print(cm_linear_final)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        28         2      0
##   Chinstrap      1        11      0
##   Gentoo         0         0     23
## 
## Overall Statistics
##                                          
##                Accuracy : 0.9538         
##                  95% CI : (0.871, 0.9904)
##     No Information Rate : 0.4462         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.927          
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 0.9655           0.8462        1.0000
## Specificity                 0.9444           0.9808        1.0000
## Pos Pred Value              0.9333           0.9167        1.0000
## Neg Pred Value              0.9714           0.9623        1.0000
## Prevalence                  0.4462           0.2000        0.3538
## Detection Rate              0.4308           0.1692        0.3538
## Detection Prevalence        0.4615           0.1846        0.3538
## Balanced Accuracy           0.9550           0.9135        1.0000

6. Nonlinear SVM Model (RBF Kernel)

6.1 Training the RBF SVM

# Train RBF SVM
svm_rbf <- svm(species ~ ., 
               data = train_data, 
               kernel = "radial", 
               cost = 1,
               gamma = 1,
               scale = TRUE)

# Model summary
summary(svm_rbf)
## 
## Call:
## svm(formula = species ~ ., data = train_data, kernel = "radial", 
##     cost = 1, gamma = 1, scale = TRUE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  71
## 
##  ( 29 22 20 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  Adelie Chinstrap Gentoo

6.2 Hyperparameter Tuning for the RBF SVM

# Hyperparameter tuning for RBF SVM
tune_rbf <- tune(svm, species ~ ., 
                 data = train_data,
                 kernel = "radial",
                 ranges = list(cost = c(0.1, 1, 10, 100),
                              gamma = c(0.01, 0.1, 1, 10)),
                 scale = TRUE)

# Best parameters
cat("Best parameters for RBF SVM:\n")
## Best parameters for RBF SVM:
cat("Cost:", tune_rbf$best.parameters$cost, "\n")
## Cost: 100
cat("Gamma:", tune_rbf$best.parameters$gamma, "\n")
## Gamma: 0.1
cat("Best CV Accuracy:", round(1 - tune_rbf$best.performance, 4), "\n")
## Best CV Accuracy: 1
# Plot tuning results
plot(tune_rbf, main = "RBF SVM Hyperparameter Tuning")

6.3 Final RBF SVM Model

# Train final RBF SVM with best parameters
svm_rbf_final <- svm(species ~ ., 
                     data = train_data, 
                     kernel = "radial", 
                     cost = tune_rbf$best.parameters$cost,
                     gamma = tune_rbf$best.parameters$gamma,
                     scale = TRUE)

# Final predictions
pred_rbf_final <- predict(svm_rbf_final, test_data)
cm_rbf_final <- confusionMatrix(pred_rbf_final, test_data$species)

cat("=== FINAL RBF SVM PERFORMANCE ===\n")
## === FINAL RBF SVM PERFORMANCE ===
print(cm_rbf_final)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        28         1      0
##   Chinstrap      1        12      0
##   Gentoo         0         0     23
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9692          
##                  95% CI : (0.8932, 0.9963)
##     No Information Rate : 0.4462          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9516          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 0.9655           0.9231        1.0000
## Specificity                 0.9722           0.9808        1.0000
## Pos Pred Value              0.9655           0.9231        1.0000
## Neg Pred Value              0.9722           0.9808        1.0000
## Prevalence                  0.4462           0.2000        0.3538
## Detection Rate              0.4308           0.1846        0.3538
## Detection Prevalence        0.4462           0.2000        0.3538
## Balanced Accuracy           0.9689           0.9519        1.0000

7. Model Comparison

7.1 Model Performance Summary

# Create comparison table
comparison_data <- data.frame(
  Model = c("Linear SVM", "RBF SVM"),
  Accuracy = c(cm_linear_final$overall['Accuracy'], 
               cm_rbf_final$overall['Accuracy']),
  Precision = c(mean(cm_linear_final$byClass[,'Precision'], na.rm = TRUE),
                mean(cm_rbf_final$byClass[,'Precision'], na.rm = TRUE)),
  Recall = c(mean(cm_linear_final$byClass[,'Recall'], na.rm = TRUE),
             mean(cm_rbf_final$byClass[,'Recall'], na.rm = TRUE)),
  F1_Score = c(mean(cm_linear_final$byClass[,'F1'], na.rm = TRUE),
               mean(cm_rbf_final$byClass[,'F1'], na.rm = TRUE))
)

comparison_data %>%
  mutate_if(is.numeric, round, 4) %>%
  kable(caption = "Perbandingan Performa Model SVM") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
SVM Model Performance Comparison
Model Accuracy Precision Recall F1_Score
Linear SVM 0.9538 0.9500 0.9372 0.9431
RBF SVM 0.9692 0.9629 0.9629 0.9629

7.2 Detailed Class-wise Performance

# Linear SVM class-wise metrics (strip the "Class: " prefix that
# confusionMatrix() puts in the byClass row names)
linear_metrics <- data.frame(
  Species = sub("Class: ", "", rownames(cm_linear_final$byClass)),
  Model = "Linear SVM",
  Precision = cm_linear_final$byClass[,'Precision'],
  Recall = cm_linear_final$byClass[,'Recall'],
  F1_Score = cm_linear_final$byClass[,'F1']
)

# RBF SVM class-wise metrics  
rbf_metrics <- data.frame(
  Species = sub("Class: ", "", rownames(cm_rbf_final$byClass)),
  Model = "RBF SVM",
  Precision = cm_rbf_final$byClass[,'Precision'],
  Recall = cm_rbf_final$byClass[,'Recall'],
  F1_Score = cm_rbf_final$byClass[,'F1']
)

# Combine, drop row names so kable does not print them, and display
class_comparison <- rbind(linear_metrics, rbf_metrics)
rownames(class_comparison) <- NULL
class_comparison %>%
  mutate_if(is.numeric, round, 4) %>%
  kable(caption = "Per-Class Performance for Each Model") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Per-Class Performance for Each Model
Species Model Precision Recall F1_Score
Adelie Linear SVM 0.9333 0.9655 0.9492
Chinstrap Linear SVM 0.9167 0.8462 0.8800
Gentoo Linear SVM 1.0000 1.0000 1.0000
Adelie RBF SVM 0.9655 0.9655 0.9655
Chinstrap RBF SVM 0.9231 0.9231 0.9231
Gentoo RBF SVM 1.0000 1.0000 1.0000

8. Decision Boundary Visualization

8.1 2D Visualization with PCA

# Since we have 4 features, we'll use PCA for 2D visualization
pca_result <- prcomp(train_data[, -1], scale. = TRUE)

# Create 2D data
train_2d <- data.frame(
  PC1 = pca_result$x[,1],
  PC2 = pca_result$x[,2],
  species = train_data$species
)

test_2d <- data.frame(
  PC1 = predict(pca_result, test_data[, -1])[,1],
  PC2 = predict(pca_result, test_data[, -1])[,2],
  species = test_data$species
)

# Variance explained
var_explained <- summary(pca_result)$importance[2,]
cat("PC1 explains", round(var_explained[1]*100, 2), "% of variance\n")
## PC1 explains 68.37 % of variance
cat("PC2 explains", round(var_explained[2]*100, 2), "% of variance\n")
## PC2 explains 19.61 % of variance
cat("Total variance explained by PC1+PC2:", round(sum(var_explained[1:2])*100, 2), "%\n")
## Total variance explained by PC1+PC2: 87.98 %

8.2 Decision Boundary Visualization

# Train SVM models on 2D data
svm_2d_linear <- svm(species ~ ., data = train_2d, kernel = "linear", scale = TRUE)
svm_2d_rbf <- svm(species ~ ., data = train_2d, kernel = "radial", scale = TRUE)

# Create prediction grid
grid_resolution <- 100
pc1_range <- range(train_2d$PC1)
pc2_range <- range(train_2d$PC2)

pc1_seq <- seq(pc1_range[1] - 1, pc1_range[2] + 1, length.out = grid_resolution)
pc2_seq <- seq(pc2_range[1] - 1, pc2_range[2] + 1, length.out = grid_resolution)

grid <- expand.grid(PC1 = pc1_seq, PC2 = pc2_seq)

# Predictions on grid
grid$linear_pred <- predict(svm_2d_linear, grid)
grid$rbf_pred <- predict(svm_2d_rbf, grid)

# Plot Linear SVM Decision Boundary
p_linear_boundary <- ggplot() +
  geom_point(data = grid, aes(x = PC1, y = PC2, color = linear_pred), 
             alpha = 0.3, size = 0.5) +
  geom_point(data = train_2d, aes(x = PC1, y = PC2, color = species), 
             size = 2, alpha = 0.8) +
  scale_color_viridis_d() +
  theme_minimal() +
  labs(title = "Linear SVM Decision Boundary (2D PCA Space)",
       x = paste("PC1 (", round(var_explained[1]*100, 1), "% variance)"),
       y = paste("PC2 (", round(var_explained[2]*100, 1), "% variance)"),
       color = "Species") +
  theme(legend.position = "bottom")

# Plot RBF SVM Decision Boundary
p_rbf_boundary <- ggplot() +
  geom_point(data = grid, aes(x = PC1, y = PC2, color = rbf_pred), 
             alpha = 0.3, size = 0.5) +
  geom_point(data = train_2d, aes(x = PC1, y = PC2, color = species), 
             size = 2, alpha = 0.8) +
  scale_color_viridis_d() +
  theme_minimal() +
  labs(title = "RBF SVM Decision Boundary (2D PCA Space)",
       x = paste("PC1 (", round(var_explained[1]*100, 1), "% variance)"),
       y = paste("PC2 (", round(var_explained[2]*100, 1), "% variance)"),
       color = "Species") +
  theme(legend.position = "bottom")

# Display plots
print(p_linear_boundary)

print(p_rbf_boundary)

9. Hyperparameter Analysis

9.1 Effect of the C Parameter

# Test different C values
c_values <- c(0.01, 0.1, 1, 10, 100)
c_results <- data.frame()

for(c_val in c_values) {
  svm_temp <- svm(species ~ ., data = train_data, kernel = "linear", 
                  cost = c_val, scale = TRUE)
  pred_temp <- predict(svm_temp, test_data)
  acc_temp <- confusionMatrix(pred_temp, test_data$species)$overall['Accuracy']
  
  # unname() drops the 'Accuracy' name so rbind() does not create odd row names
  c_results <- rbind(c_results, data.frame(C = c_val, Accuracy = unname(acc_temp)))
}

# Plot C parameter effect
p_c_effect <- ggplot(c_results, aes(x = log10(C), y = Accuracy)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  theme_minimal() +
  labs(title = "Effect of the C Parameter on Accuracy (Linear SVM)",
       x = "log10(C)", y = "Test Accuracy") +
  scale_x_continuous(breaks = log10(c_values), labels = c_values)

print(p_c_effect)

# Display results table
c_results %>%
  mutate(Accuracy = round(Accuracy, 4)) %>%
  kable(caption = "Effect of the C Parameter on the Linear SVM") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Effect of the C Parameter on the Linear SVM
C Accuracy
1e-02 0.9385
1e-01 0.9385
1e+00 0.9692
1e+01 0.9538
1e+02 0.9692

9.2 Effect of the Gamma Parameter (RBF)

# Test different gamma values (with fixed optimal C)
gamma_values <- c(0.001, 0.01, 0.1, 1, 10)
gamma_results <- data.frame()

for(gamma_val in gamma_values) {
  svm_temp <- svm(species ~ ., data = train_data, kernel = "radial", 
                  cost = tune_rbf$best.parameters$cost, 
                  gamma = gamma_val, scale = TRUE)
  pred_temp <- predict(svm_temp, test_data)
  acc_temp <- confusionMatrix(pred_temp, test_data$species)$overall['Accuracy']
  
  # unname() avoids 'Accuracy' row names in the results data frame
  gamma_results <- rbind(gamma_results, data.frame(Gamma = gamma_val, Accuracy = unname(acc_temp)))
}

# Plot gamma parameter effect
p_gamma_effect <- ggplot(gamma_results, aes(x = log10(Gamma), y = Accuracy)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 3) +
  theme_minimal() +
  labs(title = "Effect of the Gamma Parameter on Accuracy (RBF SVM)",
       x = "log10(Gamma)", y = "Test Accuracy") +
  scale_x_continuous(breaks = log10(gamma_values), labels = gamma_values)

print(p_gamma_effect)

# Display results table
gamma_results %>%
  mutate(Accuracy = round(Accuracy, 4)) %>%
  kable(caption = "Effect of the Gamma Parameter on the RBF SVM") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Effect of the Gamma Parameter on the RBF SVM
Gamma Accuracy
1e-03 0.9538
1e-02 0.9538
1e-01 0.9692
1e+00 0.9385
1e+01 0.8462

10. Feature Importance Analysis

10.1 Linear SVM Coefficients

# For a linear SVM we can examine the primal coefficients. With 3 classes,
# e1071 stores k - 1 = 2 coefficient columns, so w is a 2 x 4 matrix: one
# row per internal decision function, one column per feature.
w <- t(svm_linear_final$coefs) %*% svm_linear_final$SV

# as.numeric(w) flattens column-major, giving nrow(w) coefficients per
# feature; repeat each feature name once per decision function so the
# labels stay aligned with the values
feature_names <- colnames(train_data)[-1]
feature_importance <- data.frame(
  Feature = rep(feature_names, each = nrow(w)),
  Importance = abs(as.numeric(w))
) %>%
  arrange(desc(Importance))

feature_importance %>%
  mutate(Importance = round(Importance, 4)) %>%
  kable(caption = "Feature Importance (Linear SVM)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Feature Importance (Linear SVM)
Feature Importance
bill_length_mm 26.0013
bill_depth_mm 25.2459
flipper_length_mm 18.3424
body_mass_g 17.6635
bill_depth_mm 9.8024
body_mass_g 6.9998
flipper_length_mm 4.7038
bill_length_mm 3.9562
# Visualize feature importance
p_importance <- ggplot(feature_importance, aes(x = reorder(Feature, Importance), y = Importance)) +
  geom_bar(stat = "identity", fill = "steelblue", alpha = 0.7) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Feature Importance dalam Linear SVM",
       x = "Features", y = "Absolute Coefficient Value")

print(p_importance)

11. Cross-Validation Analysis

11.1 K-Fold Cross Validation

# Perform k-fold cross-validation
set.seed(123)
cv_folds <- 10

# Linear SVM CV
cv_linear <- train(species ~ ., 
                   data = train_data,
                   method = "svmLinear",
                   trControl = trainControl(method = "cv", number = cv_folds),
                   tuneGrid = data.frame(C = tune_linear$best.parameters$cost),
                   preProcess = c("center", "scale"))

# RBF SVM CV
cv_rbf <- train(species ~ ., 
                data = train_data,
                method = "svmRadial",
                trControl = trainControl(method = "cv", number = cv_folds),
                tuneGrid = data.frame(C = tune_rbf$best.parameters$cost,
                                     sigma = tune_rbf$best.parameters$gamma),
                preProcess = c("center", "scale"))

# Results
cv_results <- data.frame(
  Model = c("Linear SVM", "RBF SVM"),
  CV_Accuracy = c(cv_linear$results$Accuracy, cv_rbf$results$Accuracy),
  CV_Std = c(cv_linear$results$AccuracySD, cv_rbf$results$AccuracySD)
)

cv_results %>%
  mutate_if(is.numeric, round, 4) %>%
  kable(caption = "Cross-Validation Results") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Cross-Validation Results
Model CV_Accuracy CV_Std
Linear SVM 1.0000 0.0000
RBF SVM 0.9926 0.0156

12. Model Interpretability

12.1 Support Vector Analysis

# Analyze support vectors
cat("=== SUPPORT VECTORS ANALYSIS ===\n")
## === SUPPORT VECTORS ANALYSIS ===
cat("Linear SVM:\n")
## Linear SVM:
cat("Number of Support Vectors:", nrow(svm_linear_final$SV), "\n")
## Number of Support Vectors: 12
cat("Total Training Samples:", nrow(train_data), "\n")
## Total Training Samples: 268
cat("Support Vector Ratio:", round(nrow(svm_linear_final$SV)/nrow(train_data), 4), "\n\n")
## Support Vector Ratio: 0.0448
cat("RBF SVM:\n")
## RBF SVM:
cat("Number of Support Vectors:", nrow(svm_rbf_final$SV), "\n")
## Number of Support Vectors: 16
cat("Total Training Samples:", nrow(train_data), "\n")
## Total Training Samples: 268
cat("Support Vector Ratio:", round(nrow(svm_rbf_final$SV)/nrow(train_data), 4), "\n")
## Support Vector Ratio: 0.0597
# Fitted class distribution on the training set
# (NOTE: $fitted returns the fitted class for every training row, not just
# the support vectors, so these counts mirror the training distribution;
# see the sketch below for true per-class support-vector counts)
fitted_by_class_linear <- table(svm_linear_final$fitted)
fitted_by_class_rbf <- table(svm_rbf_final$fitted)

sv_comparison <- data.frame(
  Species = names(fitted_by_class_linear),
  Linear_SVM_Fitted = as.numeric(fitted_by_class_linear),
  RBF_SVM_Fitted = as.numeric(fitted_by_class_rbf)
)

sv_comparison %>%
  kable(caption = "Support Vectors per Species") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Fitted Class Counts per Species (Training Set)
Species Linear_SVM_Fitted RBF_SVM_Fitted
Adelie 117 117
Chinstrap 55 55
Gentoo 96 96
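
Because $fitted covers every training row, the table above mirrors the training-class distribution rather than the support vectors themselves. A minimal sketch of counting the true support vectors per class, using the index component (row indices of the support vectors) that e1071 stores on every fitted model:

# True support-vector counts per class, via the model's index component
table(train_data$species[svm_linear_final$index])
table(train_data$species[svm_rbf_final$index])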

13. Error Analysis

13.1 Misclassification Analysis

# Analyze misclassifications
test_results <- data.frame(
  Actual = test_data$species,
  Linear_Pred = pred_linear_final,
  RBF_Pred = pred_rbf_final,
  Linear_Correct = pred_linear_final == test_data$species,
  RBF_Correct = pred_rbf_final == test_data$species
)

# Count misclassifications
linear_errors <- sum(!test_results$Linear_Correct)
rbf_errors <- sum(!test_results$RBF_Correct)

cat("Linear SVM Misclassifications:", linear_errors, "out of", nrow(test_data), "\n")
## Linear SVM Misclassifications: 3 out of 65
cat("RBF SVM Misclassifications:", rbf_errors, "out of", nrow(test_data), "\n")
## RBF SVM Misclassifications: 2 out of 65
# Show misclassified cases
if(linear_errors > 0) {
  cat("\n=== LINEAR SVM MISCLASSIFICATIONS ===\n")
  misclassified_linear <- test_results[!test_results$Linear_Correct, ]
  print(misclassified_linear)
}
## 
## === LINEAR SVM MISCLASSIFICATIONS ===
##       Actual Linear_Pred  RBF_Pred Linear_Correct RBF_Correct
## 17    Adelie   Chinstrap Chinstrap          FALSE       FALSE
## 56 Chinstrap      Adelie    Adelie          FALSE       FALSE
## 64 Chinstrap      Adelie Chinstrap          FALSE        TRUE
if(rbf_errors > 0) {
  cat("\n=== RBF SVM MISCLASSIFICATIONS ===\n")
  misclassified_rbf <- test_results[!test_results$RBF_Correct, ]
  print(misclassified_rbf)
}
## 
## === RBF SVM MISCLASSIFICATIONS ===
##       Actual Linear_Pred  RBF_Pred Linear_Correct RBF_Correct
## 17    Adelie   Chinstrap Chinstrap          FALSE       FALSE
## 56 Chinstrap      Adelie    Adelie          FALSE       FALSE

14. Conclusions and Interpretation

14.1 Summary of Results

Based on the analysis carried out, the main findings are summarized below:

Model Performance:

  1. Linear SVM: reached an accuracy of 0.9538 on the test set
  2. RBF SVM: reached an accuracy of 0.9692 on the test set

Optimal Hyperparameters:

  • Linear SVM: C = 10
  • RBF SVM: C = 100, Gamma = 0.1

14.2 Parameter Interpretation

The C Parameter (Regularization):

  • Role: controls the trade-off between a wide margin and classification errors (see the objective below)
  • Low values: wider margin, more tolerant of misclassification (risk of underfitting)
  • High values: narrower margin, stricter fit to the training data (risk of overfitting)
  • Optimal: C = 10 gave the best balance
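
For reference, C enters the standard soft-margin objective as the penalty on the slack variables:

$$
\min_{w,\,b,\,\xi}\ \frac{1}{2}\lVert w \rVert^2 + C \sum_{i=1}^{n} \xi_i
\quad \text{subject to} \quad y_i\left(w^\top x_i + b\right) \ge 1 - \xi_i,\quad \xi_i \ge 0
$$

Larger C penalizes margin violations more heavily, which tightens the fit to the training data.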

The Gamma Parameter (RBF Kernel):

  • Role: controls how far the influence of a single training sample reaches (see the kernel below)
  • Low values: each sample's influence extends far (smooth decision boundary)
  • High values: influence is confined to the nearest samples (complex decision boundary)
  • Optimal: Gamma = 0.1 gave an appropriate level of complexity
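
For reference, the RBF kernel used by the radial models is

$$
K(x, x') = \exp\left(-\gamma \lVert x - x' \rVert^2\right)
$$

so a larger gamma makes the kernel decay faster with distance, confining each sample's influence to a smaller neighborhood.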

14.3 Model Comparison Analysis

Advantages of the Linear SVM:

  1. High interpretability: coefficients can be interpreted directly
  2. Computational efficiency: faster training and prediction
  3. Robustness to overfitting: especially on small datasets
  4. Good generalization: well suited when the data are linearly separable

Advantages of the RBF SVM:

  1. High flexibility: can handle non-linear relationships
  2. Superior performance: on complex datasets with non-linear patterns
  3. Adaptability: can adjust to many different data distributions

Recommendation:

Based on the analysis:

  • The RBF SVM showed slightly better test performance (0.9692 vs. 0.9538)
  • The penguin dataset contains some non-linear relationships that the RBF kernel captures better
  • Both models need only a small fraction of the training data as support vectors (4.5% linear vs. 6.0% RBF); the linear model's lower ratio reflects its simpler, sparser decision function

14.4 Feature Importance Insights

From the feature-importance analysis of the linear SVM:

  1. bill_length_mm is the most important feature
  2. bill_depth_mm makes the second-largest contribution
  3. All features contribute to the classification, indicating that every morphological measurement is relevant

14.5 Decision Boundary Analysis

The decision boundary visualizations in PCA space show:

  • Linear SVM: forms clear, straight separating boundaries
  • RBF SVM: produces more complex and flexible boundaries
  • Separability: the penguin dataset is relatively well separated, which explains the high performance of both models

14.6 Practical Implications

For Biological Research:

  1. Morphology as a classifier: physical measurements are reliable predictors
  2. Species differentiation: the models can support automated species identification
  3. Feature selection: all measurements provide valuable information

For Machine Learning:

  1. Kernel selection: the RBF kernel gives a slight advantage on this dataset
  2. Hyperparameter tuning: critical for optimal performance
  3. Cross-validation: essential for assessing generalization

15. Limitations and Suggestions

15.1 Study Limitations

  1. Dataset size: a relatively small dataset (n = 333) may limit generalizability
  2. Feature engineering: no feature transformation or creation was performed
  3. Class imbalance: the class distribution is slightly imbalanced
  4. Missing data: some observations were removed because of missing values

15.2 Suggestions for Future Work

For Further Research:

  1. Ensemble methods: combining multiple SVMs via voting/bagging
  2. Feature engineering: PCA, polynomial features, or domain-specific transformations
  3. Advanced kernels: custom kernels or kernel combinations (see the polynomial-kernel sketch after this list)
  4. Deep learning comparison: benchmarking against neural networks
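
As a sketch of the "advanced kernels" direction, e1071 already ships a polynomial kernel; the snippet below assumes the train_data and test_data objects from Section 4, with illustrative (untuned) hyperparameters:

# Polynomial kernel as an alternative nonlinear kernel (untuned sketch)
svm_poly <- svm(species ~ ., data = train_data, kernel = "polynomial",
                degree = 3, coef0 = 1, cost = 1, scale = TRUE)
# Test-set accuracy of the untuned polynomial model
mean(predict(svm_poly, test_data) == test_data$species)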

For Practical Implementation:

  1. Real-time classification: optimization for deployment
  2. Mobile applications: integration for field research
  3. Uncertainty quantification: confidence estimates for predictions (see the sketch after this list)
  4. Active learning: for continuous model improvement
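
For uncertainty quantification (item 3 above), e1071 can attach Platt-scaled class probabilities to an SVM; a minimal sketch, reusing the tuned hyperparameters from Section 6:

# Refit with probability = TRUE to enable class-probability estimates
svm_prob <- svm(species ~ ., data = train_data, kernel = "radial",
                cost = tune_rbf$best.parameters$cost,
                gamma = tune_rbf$best.parameters$gamma,
                scale = TRUE, probability = TRUE)
pred_prob <- predict(svm_prob, test_data, probability = TRUE)
# Per-class probability estimates for the first few test observations
head(attr(pred_prob, "probabilities"))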

16. References and Sources

16.1 Dataset

  • Palmer Penguins dataset: Gorman KB, Williams TD, Fraser WR (2014). Ecological sexual dimorphism and environmental variability within a community of Antarctic penguins (genus Pygoscelis). PLoS ONE 9(3): e90081.
  • R package: palmerpenguins

16.2 Methodology

  • Vapnik, V. (1995). The Nature of Statistical Learning Theory. Springer.
  • Hastie, T., Tibshirani, R., & Friedman, J. (2009). The Elements of Statistical Learning (2nd ed.). Springer.
  • James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An Introduction to Statistical Learning. Springer.

16.3 Software

  • R version 4.3.0 (2023-04-21)
  • Key packages: e1071, caret, ggplot2, dplyr

Appendix: Session Information

sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS 15.3.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Asia/Jakarta
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0     knitr_1.50           plotly_4.10.4       
##  [4] corrplot_0.92        viridis_0.6.3        viridisLite_0.4.2   
##  [7] gridExtra_2.3        caret_6.0-94         lattice_0.21-8      
## [10] e1071_1.7-13         lubridate_1.9.2      forcats_1.0.0       
## [13] stringr_1.5.0        dplyr_1.1.4          purrr_1.0.1         
## [16] readr_2.1.4          tidyr_1.3.0          tibble_3.2.1        
## [19] ggplot2_3.5.2        tidyverse_2.0.0      palmerpenguins_0.1.1
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.0     timeDate_4022.108    farver_2.1.1        
##  [4] lazyeval_0.2.2       fastmap_1.1.1        pROC_1.18.4         
##  [7] digest_0.6.31        rpart_4.1.19         timechange_0.2.0    
## [10] lifecycle_1.0.3      survival_3.5-5       kernlab_0.9-32      
## [13] magrittr_2.0.3       compiler_4.3.0       rlang_1.1.1         
## [16] sass_0.4.6           tools_4.3.0          utf8_1.2.3          
## [19] yaml_2.3.7           data.table_1.14.8    labeling_0.4.2      
## [22] htmlwidgets_1.6.2    xml2_1.3.4           plyr_1.8.8          
## [25] withr_2.5.0          nnet_7.3-18          grid_4.3.0          
## [28] stats4_4.3.0         fansi_1.0.4          colorspace_2.1-0    
## [31] future_1.33.0        globals_0.16.2       scales_1.3.0        
## [34] iterators_1.0.14     MASS_7.3-60          cli_3.6.1           
## [37] rmarkdown_2.22       generics_0.1.3       rstudioapi_0.14     
## [40] future.apply_1.11.0  httr_1.4.6           reshape2_1.4.4      
## [43] tzdb_0.4.0           cachem_1.0.8         proxy_0.4-27        
## [46] splines_4.3.0        parallel_4.3.0       vctrs_0.6.4         
## [49] hardhat_1.3.0        Matrix_1.5-4         jsonlite_1.8.4      
## [52] hms_1.1.3            listenv_0.9.0        systemfonts_1.0.4   
## [55] foreach_1.5.2        gower_1.0.1          jquerylib_0.1.4     
## [58] recipes_1.0.8        glue_1.6.2           parallelly_1.36.0   
## [61] codetools_0.2-19     stringi_1.7.12       gtable_0.3.3        
## [64] munsell_0.5.0        pillar_1.9.0         htmltools_0.5.6     
## [67] ipred_0.9-14         lava_1.7.2.1         R6_2.5.1            
## [70] evaluate_0.21        bslib_0.4.2          class_7.3-21        
## [73] Rcpp_1.0.10          svglite_2.1.3        nlme_3.1-166        
## [76] prodlim_2023.08.28   xfun_0.52            pkgconfig_2.0.3     
## [79] ModelMetrics_1.2.2.2

Note: This report was produced with R Markdown to ensure reproducibility and transparency of the analysis. All code and visualizations can be reproduced by running the provided scripts.


SVM Assignment Report - Penguin Species Classification
Course: Machine Learning
Date: 2025-06-02