1. Pendahuluan

Support Vector Regression (SVR) adalah ekstensi dari Support Vector Machine untuk masalah regresi. SVR menggunakan konsep epsilon-tube yang memungkinkan toleransi error dalam prediksi, berbeda dengan regresi linear biasa yang mencoba meminimalkan semua error.

Konsep Kunci SVR:

  • Epsilon-tube (ε-tube): Area toleransi di mana error yang lebih kecil dari ε tidak dikenai penalti
  • Support Vectors: Data points yang berada tepat pada batas atau di luar epsilon-tube
  • Kernel Functions: Untuk menangani non-linear relationships

Tujuan Analisis:

  • Memahami implementasi SVR linear dan nonlinear
  • Membandingkan performa SVR dengan OLS regression
  • Mengeksplorasi pengaruh parameter epsilon, C, dan gamma
  • Visualisasi epsilon-tube dan support vectors

2. Setup dan Load Library

# Install packages jika belum ada
# install.packages(c("e1071", "caret", "ggplot2", "dplyr", "gridExtra", 
#                    "corrplot", "Metrics", "plotly", "GGally"))

# Load libraries
library(e1071)        # untuk SVR
library(caret)        # untuk machine learning
library(ggplot2)      # untuk visualisasi
library(dplyr)        # untuk manipulasi data
library(gridExtra)    # untuk multiple plots
library(corrplot)     # untuk correlation plot
library(Metrics)      # untuk evaluation metrics
library(knitr)        # untuk kable
library(GGally)       # untuk pair plots
library(plotly)       # untuk interactive plots
library(tidyr)        # untuk data reshaping

3. Dataset dan Eksplorasi Data

Dataset mtcars berisi data tentang performa mobil dengan 32 observasi dan 11 variabel. Kita akan memprediksi konsumsi bahan bakar (mpg) berdasarkan variabel lainnya.

# Load dataset mtcars
data(mtcars)
df <- mtcars

# Tampilkan informasi dasar dataset
cat("Dimensi dataset:", dim(df), "\n")
## Dimensi dataset: 32 11
cat("Struktur dataset:\n")
## Struktur dataset:
str(df)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# Variable descriptions: header line followed by one line per column of df
var_descriptions <- c(
  "Deskripsi Variabel:",
  "mpg  - Miles per gallon (target variable)",
  "cyl  - Number of cylinders",
  "disp - Displacement (cu.in.)",
  "hp   - Gross horsepower",
  "drat - Rear axle ratio",
  "wt   - Weight (1000 lbs)",
  "qsec - 1/4 mile time",
  "vs   - Engine (0=V-shaped, 1=straight)",
  "am   - Transmission (0=automatic, 1=manual)",
  "gear - Number of forward gears",
  "carb - Number of carburetors"
)
# writeLines() terminates every element with a newline
writeLines(var_descriptions)
## Deskripsi Variabel:
## mpg  - Miles per gallon (target variable)
## cyl  - Number of cylinders
## disp - Displacement (cu.in.)
## hp   - Gross horsepower
## drat - Rear axle ratio
## wt   - Weight (1000 lbs)
## qsec - 1/4 mile time
## vs   - Engine (0=V-shaped, 1=straight)
## am   - Transmission (0=automatic, 1=manual)
## gear - Number of forward gears
## carb - Number of carburetors
# Summary statistik
summary(df)
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000
# Cek missing values
cat("Missing values per kolom:\n")
## Missing values per kolom:
colSums(is.na(df))
##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##    0    0    0    0    0    0    0    0    0    0    0
cat("\nTidak ada missing values dalam dataset")
## 
## Tidak ada missing values dalam dataset

3.1 Visualisasi Eksplorasi Data

# Distribution of the target variable (mpg): histogram with a density overlay.
# The density is multiplied by nrow(df) * 2 to lift it onto the count axis
# (presumably approximating n * bin width -- confirm if bins changes).
ggplot(df, aes(x = mpg)) +
  geom_histogram(bins = 15, fill = "skyblue", alpha = 0.7, color = "black") +
  # after_stat() replaces the `..density..` notation deprecated in ggplot2 3.4.0
  geom_density(aes(y = after_stat(density) * nrow(df) * 2),
               color = "red", size = 1) +
  labs(title = "Distribusi Miles per Gallon (MPG)",
       x = "Miles per Gallon",
       y = "Frequency") +
  theme_minimal()

# Pairwise correlations between all columns of df
cor_matrix <- cor(df)

# corrplot() draws with zero margins by default, which clips the main title;
# reserve space at the top so the title is fully visible.
corrplot(cor_matrix, method = "color", type = "upper",
         order = "hclust", tl.col = "black", tl.srt = 45,
         title = "Korelasi Antar Variabel",
         mar = c(0, 0, 2, 0))

# Scatter-plot matrix restricted to the target and a few key predictors
important_vars <- c("mpg", "wt", "hp", "disp", "cyl")
ggpairs(
  df[, important_vars],
  title = "Scatter Plot Matrix - Variabel Penting"
) +
  theme_minimal()

# Relationship between mpg and its key predictors.

# Scatter plot of mpg against one numeric column of df, with a fitted linear
# trend line and its confidence band.
mpg_scatter <- function(x_var, point_color, plot_title, x_label) {
  ggplot(df, aes(x = .data[[x_var]], y = mpg)) +
    geom_point(size = 3, alpha = 0.7, color = point_color) +
    geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = plot_title, x = x_label, y = "MPG") +
    theme_minimal()
}

p1 <- mpg_scatter("wt", "blue", "MPG vs Weight", "Weight (1000 lbs)")
p2 <- mpg_scatter("hp", "green", "MPG vs Horsepower", "Horsepower")
p3 <- mpg_scatter("disp", "purple", "MPG vs Displacement", "Displacement")

# Cylinder count is treated as a category, so use box plots instead
p4 <- ggplot(df, aes(x = factor(cyl), y = mpg)) +
  geom_boxplot(fill = "orange", alpha = 0.7) +
  labs(title = "MPG vs Cylinders", x = "Number of Cylinders", y = "MPG") +
  theme_minimal()

# 2 x 2 panel
grid.arrange(p1, p2, p3, p4, ncol = 2)

4. Preprocessing Data

# Fix the RNG seed so the random partition below is reproducible
set.seed(123)

# The dataset is small (32 observations), so a 70/30 train-test split is used.
# train_index holds the row positions selected for training; the remaining
# rows form the test set.
train_index <- createDataPartition(df$mpg, p = 0.7, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

cat("Ukuran data training:", nrow(train_data), "\n")
## Ukuran data training: 24
cat("Ukuran data testing:", nrow(test_data), "\n")
## Ukuran data testing: 8
# Statistik target variable
cat("\nStatistik MPG - Training:\n")
## 
## Statistik MPG - Training:
cat("Mean:", round(mean(train_data$mpg), 2), "\n")
## Mean: 20.23
cat("SD:", round(sd(train_data$mpg), 2), "\n")
## SD: 6.19
cat("Range:", round(range(train_data$mpg), 2), "\n")
## Range: 10.4 33.9
cat("\nStatistik MPG - Testing:\n")
## 
## Statistik MPG - Testing:
cat("Mean:", round(mean(test_data$mpg), 2), "\n")
## Mean: 19.68
cat("SD:", round(sd(test_data$mpg), 2), "\n")
## SD: 5.89
cat("Range:", round(range(test_data$mpg), 2), "\n")
## Range: 14.3 30.4
# Standardise the predictors for SVR.
# The centering/scaling statistics come from the training data only and are
# stored per column so the same transformation can be applied to the test data.
predictor_cols <- setdiff(names(train_data), "mpg")
scale_params <- lapply(
  setNames(predictor_cols, predictor_cols),
  function(nm) {
    list(center = mean(train_data[[nm]]), scale = sd(train_data[[nm]]))
  }
)

# Start from copies so the target column (mpg) keeps its original values
train_scaled <- train_data
test_scaled <- test_data

for (nm in names(scale_params)) {
  col_stats <- scale_params[[nm]]
  # Training column: standardised with scale(); [, 1] drops the matrix shape
  train_scaled[[nm]] <- scale(train_data[[nm]])[, 1]
  # Test column: transformed with the stored training mean and sd
  test_scaled[[nm]] <- (test_data[[nm]] - col_stats$center) / col_stats$scale
}

# Target variable tidak di-scale untuk interpretasi yang mudah
cat("Data berhasil di-scale. Target variable (mpg) tetap dalam skala asli.\n")
## Data berhasil di-scale. Target variable (mpg) tetap dalam skala asli.

5. Model Baseline: OLS (Ordinary Least Squares)

# Model OLS untuk perbandingan
ols_model <- lm(mpg ~ ., data = train_data)

# Summary model
summary(ols_model)
## 
## Call:
## lm(formula = mpg ~ ., data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8335 -1.2145 -0.0044  1.1104  4.1136 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.72655   27.33755  -0.539    0.599
## cyl           1.33019    1.41806   0.938    0.365
## disp          0.01218    0.01887   0.645    0.530
## hp           -0.01756    0.02588  -0.678    0.509
## drat          2.67080    2.51575   1.062    0.308
## wt           -3.65531    2.37988  -1.536    0.149
## qsec          1.02957    0.86524   1.190    0.255
## vs           -0.37848    3.00566  -0.126    0.902
## am            1.82482    2.74395   0.665    0.518
## gear          3.74182    2.99973   1.247    0.234
## carb         -1.42190    1.42397  -0.999    0.336
## 
## Residual standard error: 2.641 on 13 degrees of freedom
## Multiple R-squared:  0.8971, Adjusted R-squared:  0.818 
## F-statistic: 11.33 on 10 and 13 DF,  p-value: 6.948e-05
# OLS predictions on the training and test partitions
pred_ols_train <- predict(ols_model, newdata = train_data)
pred_ols_test <- predict(ols_model, newdata = test_data)

# OLS evaluation: RMSE, MAE and the squared correlation between observed and
# predicted mpg (reported as R²)
ols_train_rmse <- rmse(actual = train_data$mpg, predicted = pred_ols_train)
ols_train_mae <- mae(actual = train_data$mpg, predicted = pred_ols_train)
ols_train_r2 <- cor(x = train_data$mpg, y = pred_ols_train)^2

ols_test_rmse <- rmse(actual = test_data$mpg, predicted = pred_ols_test)
ols_test_mae <- mae(actual = test_data$mpg, predicted = pred_ols_test)
ols_test_r2 <- cor(x = test_data$mpg, y = pred_ols_test)^2

cat("=== OLS Performance ===\n")
## === OLS Performance ===
cat("Training - RMSE:", round(ols_train_rmse, 3), "MAE:", round(ols_train_mae, 3), "R²:", round(ols_train_r2, 3), "\n")
## Training - RMSE: 1.944 MAE: 1.527 R²: 0.897
cat("Testing  - RMSE:", round(ols_test_rmse, 3), "MAE:", round(ols_test_mae, 3), "R²:", round(ols_test_r2, 3), "\n")
## Testing  - RMSE: 3.989 MAE: 2.844 R²: 0.658

6. Model SVR Linear

# Fit an epsilon-regression SVM with a linear kernel on the manually
# standardised training data (all columns except mpg as predictors).
# cost and epsilon are fixed at 1 and 0.1 here; they are not tuned.
svr_linear <- svm(mpg ~ ., 
                  data = train_scaled,
                  type = "eps-regression",
                  kernel = "linear",
                  cost = 1,
                  epsilon = 0.1,
                  scale = FALSE)  # predictors were already scaled manually

# Summary model
summary(svr_linear)
## 
## Call:
## svm(formula = mpg ~ ., data = train_scaled, type = "eps-regression", 
##     kernel = "linear", cost = 1, epsilon = 0.1, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.1 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  24
# Linear SVR predictions on the standardised training and test partitions
pred_svr_linear_train <- predict(svr_linear, newdata = train_scaled)
pred_svr_linear_test <- predict(svr_linear, newdata = test_scaled)

# Linear SVR evaluation: RMSE, MAE and the squared correlation between
# observed and predicted mpg (reported as R²)
svr_linear_train_rmse <- rmse(actual = train_data$mpg,
                              predicted = pred_svr_linear_train)
svr_linear_train_mae <- mae(actual = train_data$mpg,
                            predicted = pred_svr_linear_train)
svr_linear_train_r2 <- cor(x = train_data$mpg, y = pred_svr_linear_train)^2

svr_linear_test_rmse <- rmse(actual = test_data$mpg,
                             predicted = pred_svr_linear_test)
svr_linear_test_mae <- mae(actual = test_data$mpg,
                           predicted = pred_svr_linear_test)
svr_linear_test_r2 <- cor(x = test_data$mpg, y = pred_svr_linear_test)^2

cat("=== SVR Linear Performance ===\n")
## === SVR Linear Performance ===
cat("Training - RMSE:", round(svr_linear_train_rmse, 3), "MAE:", round(svr_linear_train_mae, 3), "R²:", round(svr_linear_train_r2, 3), "\n")
## Training - RMSE: 2.25 MAE: 1.601 R²: 0.877
cat("Testing  - RMSE:", round(svr_linear_test_rmse, 3), "MAE:", round(svr_linear_test_mae, 3), "R²:", round(svr_linear_test_r2, 3), "\n")
## Testing  - RMSE: 2.913 MAE: 2.328 R²: 0.776
cat("Support Vectors:", svr_linear$tot.nSV, "\n")
## Support Vectors: 24

7. Model SVR Nonlinear (RBF Kernel)

# Fit an epsilon-regression SVM with a radial (RBF) kernel on the manually
# standardised training data. cost, epsilon and gamma are fixed starting
# values here; scale = FALSE because the predictors were scaled beforehand.
svr_rbf <- svm(mpg ~ ., 
               data = train_scaled,
               type = "eps-regression",
               kernel = "radial",
               cost = 1,
               epsilon = 0.1,
               gamma = 0.1,
               scale = FALSE)

# Summary model
summary(svr_rbf)
## 
## Call:
## svm(formula = mpg ~ ., data = train_scaled, type = "eps-regression", 
##     kernel = "radial", cost = 1, epsilon = 0.1, gamma = 0.1, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.1 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  24
# RBF SVR predictions on the standardised training and test partitions
pred_svr_rbf_train <- predict(svr_rbf, newdata = train_scaled)
pred_svr_rbf_test <- predict(svr_rbf, newdata = test_scaled)

# RBF SVR evaluation: RMSE, MAE and the squared correlation between observed
# and predicted mpg (reported as R²)
svr_rbf_train_rmse <- rmse(actual = train_data$mpg,
                           predicted = pred_svr_rbf_train)
svr_rbf_train_mae <- mae(actual = train_data$mpg,
                         predicted = pred_svr_rbf_train)
svr_rbf_train_r2 <- cor(x = train_data$mpg, y = pred_svr_rbf_train)^2

svr_rbf_test_rmse <- rmse(actual = test_data$mpg,
                          predicted = pred_svr_rbf_test)
svr_rbf_test_mae <- mae(actual = test_data$mpg,
                        predicted = pred_svr_rbf_test)
svr_rbf_test_r2 <- cor(x = test_data$mpg, y = pred_svr_rbf_test)^2

cat("=== SVR RBF Performance ===\n")
## === SVR RBF Performance ===
cat("Training - RMSE:", round(svr_rbf_train_rmse, 3), "MAE:", round(svr_rbf_train_mae, 3), "R²:", round(svr_rbf_train_r2, 3), "\n")
## Training - RMSE: 4.186 MAE: 2.946 R²: 0.788
cat("Testing  - RMSE:", round(svr_rbf_test_rmse, 3), "MAE:", round(svr_rbf_test_mae, 3), "R²:", round(svr_rbf_test_r2, 3), "\n")
## Testing  - RMSE: 4.123 MAE: 3.177 R²: 0.82
cat("Support Vectors:", svr_rbf$tot.nSV, "\n")
## Support Vectors: 24

8. Hyperparameter Tuning untuk SVR RBF

# Grid search with 5-fold cross-validation over cost, epsilon and gamma for
# the RBF-kernel SVR.
#
# scale = FALSE is forwarded to svm() so candidates are evaluated under the
# same conditions as the models fitted elsewhere in this analysis, which all
# use scale = FALSE. With svm()'s default (scale = TRUE) the response is
# standardised internally, so the selected epsilon and cost would refer to a
# different mpg scale than the one used when the best model is refitted.
#
# NOTE: the recorded console output that follows this call was produced
# before scale = FALSE was added; re-run the analysis to refresh it.
tune_result <- tune(svm, mpg ~ .,
                    data = train_scaled,
                    type = "eps-regression",
                    kernel = "radial",
                    ranges = list(
                      cost = c(0.1, 1, 10, 100),
                      epsilon = c(0.01, 0.1, 0.2, 0.5),
                      gamma = c(0.01, 0.1, 0.5, 1)
                    ),
                    tunecontrol = tune.control(cross = 5),
                    scale = FALSE)

# Best parameters
print(tune_result)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 5-fold cross validation 
## 
## - best parameters:
##  cost epsilon gamma
##   100     0.5  0.01
## 
## - best performance: 6.517972
best_params <- tune_result$best.parameters
cat("\nBest parameters:\n")
## 
## Best parameters:
cat("Cost:", best_params$cost, "\n")
## Cost: 100
cat("Epsilon:", best_params$epsilon, "\n")
## Epsilon: 0.5
cat("Gamma:", best_params$gamma, "\n")
## Gamma: 0.01
# Refit the RBF SVR on the full standardised training set using the
# cost/epsilon/gamma combination selected by the grid search (best_params).
# scale = FALSE: predictors are already standardised and mpg stays in its
# original units.
svr_best <- svm(mpg ~ ., 
                data = train_scaled,
                type = "eps-regression",
                kernel = "radial",
                cost = best_params$cost,
                epsilon = best_params$epsilon,
                gamma = best_params$gamma,
                scale = FALSE)

# Predictions of the tuned model on the standardised partitions
pred_svr_best_train <- predict(svr_best, newdata = train_scaled)
pred_svr_best_test <- predict(svr_best, newdata = test_scaled)

# Tuned-model evaluation: RMSE, MAE and the squared correlation between
# observed and predicted mpg (reported as R²)
svr_best_train_rmse <- rmse(actual = train_data$mpg,
                            predicted = pred_svr_best_train)
svr_best_train_mae <- mae(actual = train_data$mpg,
                          predicted = pred_svr_best_train)
svr_best_train_r2 <- cor(x = train_data$mpg, y = pred_svr_best_train)^2

svr_best_test_rmse <- rmse(actual = test_data$mpg,
                           predicted = pred_svr_best_test)
svr_best_test_mae <- mae(actual = test_data$mpg,
                         predicted = pred_svr_best_test)
svr_best_test_r2 <- cor(x = test_data$mpg, y = pred_svr_best_test)^2

cat("=== SVR Best Model Performance ===\n")
## === SVR Best Model Performance ===
cat("Training - RMSE:", round(svr_best_train_rmse, 3), "MAE:", round(svr_best_train_mae, 3), "R²:", round(svr_best_train_r2, 3), "\n")
## Training - RMSE: 1.912 MAE: 1.327 R²: 0.91
cat("Testing  - RMSE:", round(svr_best_test_rmse, 3), "MAE:", round(svr_best_test_mae, 3), "R²:", round(svr_best_test_r2, 3), "\n")
## Testing  - RMSE: 2.696 MAE: 2.135 R²: 0.78
cat("Support Vectors:", svr_best$tot.nSV, "\n")
## Support Vectors: 23

9. Visualisasi Epsilon-Tube dan Support Vectors

Untuk visualisasi epsilon-tube, kita akan menggunakan hubungan 1D antara weight (wt) dan mpg.

# One-predictor data sets for the epsilon-tube plot: standardised weight
# paired with mpg in its original units
train_1d <- data.frame(wt = train_scaled[["wt"]], mpg = train_data[["mpg"]])
test_1d <- data.frame(wt = test_scaled[["wt"]], mpg = test_data[["mpg"]])

# RBF SVR of mpg on weight only, reusing the tuned hyperparameters.
# `scale` is not passed here, so svm()'s default scaling applies to this fit.
svr_1d <- svm(
  mpg ~ wt,
  data = train_1d,
  type = "eps-regression",
  kernel = "radial",
  cost = best_params$cost,
  epsilon = best_params$epsilon,
  gamma = best_params$gamma
)

cat("Support Vectors untuk model 1D:", svr_1d$tot.nSV, "\n")
## Support Vectors untuk model 1D: 8
# Plot a fitted one-predictor SVR curve together with its epsilon-tube and
# highlight the support vectors.
#
# Args:
#   model:      svm() fit of mpg on wt; model$index gives the training rows
#               that are support vectors.
#   train_data: data frame with columns `wt` and `mpg` used to fit `model`.
#   epsilon:    epsilon value the model was fitted with.
#   title:      plot title.
#
# Returns:
#   A ggplot object (fitted curve, tube band and boundaries, training points).
visualize_epsilon_tube <- function(model, train_data, epsilon, title) {
  # When svm() standardises the response (scale = TRUE), epsilon applies on
  # the standardised scale while predictions are returned in original units.
  # Convert the tube half-width to prediction units so the drawn band matches
  # the tube the model actually used; with an unscaled response it is epsilon.
  y_sd <- model$y.scale[["scaled:scale"]]
  tube_half_width <- if (is.null(y_sd)) epsilon else epsilon * y_sd

  # Prediction grid slightly wider than the observed range of wt
  wt_grid <- seq(min(train_data$wt) - 0.5, max(train_data$wt) + 0.5,
                 length.out = 100)
  curve_data <- data.frame(wt = wt_grid)
  curve_data$pred <- predict(model, curve_data)
  curve_data$lower <- curve_data$pred - tube_half_width
  curve_data$upper <- curve_data$pred + tube_half_width

  # Split training rows by support-vector membership. A logical mask is used
  # because negative indexing with an empty index would drop every row.
  sv_indices <- model$index
  is_sv <- seq_len(nrow(train_data)) %in% sv_indices
  sv_data <- train_data[is_sv, , drop = FALSE]
  non_sv_data <- train_data[!is_sv, , drop = FALSE]

  subtitle_text <- paste("Epsilon =", epsilon,
                         "| Support Vectors =", length(sv_indices))
  if (!is.null(y_sd)) {
    # Make the rescaling visible so the band width is not mistaken for epsilon
    subtitle_text <- paste0(subtitle_text, " | Tube half-width = ",
                            round(tube_half_width, 2))
  }

  ggplot() +
    # Epsilon tube
    geom_ribbon(data = curve_data,
                aes(x = wt, ymin = lower, ymax = upper),
                alpha = 0.2, fill = "gray") +
    # Regression line
    geom_line(data = curve_data,
              aes(x = wt, y = pred), color = "red", size = 1.2) +
    # Support vectors (open red circles)
    geom_point(data = sv_data, aes(x = wt, y = mpg),
               color = "red", size = 4, shape = 1, stroke = 2) +
    # Non-support vectors
    geom_point(data = non_sv_data, aes(x = wt, y = mpg),
               color = "blue", size = 3, alpha = 0.7) +
    # Tube boundaries
    geom_line(data = curve_data, aes(x = wt, y = upper),
              linetype = "dashed", color = "gray40") +
    geom_line(data = curve_data, aes(x = wt, y = lower),
              linetype = "dashed", color = "gray40") +
    labs(title = title,
         x = "Weight (scaled)",
         y = "Miles per Gallon",
         subtitle = subtitle_text) +
    theme_minimal() +
    theme(legend.position = "bottom")
}

# Draw the epsilon-tube for the one-predictor model
epsilon_plot <- visualize_epsilon_tube(svr_1d, train_1d, best_params$epsilon, 
                                      "SVR Epsilon-Tube Visualization")
print(epsilon_plot)

# Residual analysis: relate each training point's residual to its
# support-vector status
train_1d$predicted <- predict(svr_1d, train_1d)
train_1d$residuals <- train_1d$mpg - train_1d$predicted
train_1d$abs_residuals <- abs(train_1d$residuals)
train_1d$is_sv <- seq_len(nrow(train_1d)) %in% svr_1d$index

# Residuals are in mpg, but when svm() standardised the response the model's
# epsilon applies on the standardised scale. Convert the tube half-width to
# mpg so the reference lines sit where the tube really is.
y_sd_1d <- svr_1d$y.scale[["scaled:scale"]]
eps_tube <- if (is.null(y_sd_1d)) {
  best_params$epsilon
} else {
  best_params$epsilon * y_sd_1d
}

# Residual plot with the tube limits as dashed reference lines
ggplot(train_1d, aes(x = predicted, y = residuals, color = is_sv, size = is_sv)) +
  geom_hline(yintercept = c(-eps_tube, eps_tube),
             linetype = "dashed", color = "gray40") +
  geom_hline(yintercept = 0, color = "red") +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "red"), 
                     labels = c("Non-Support Vector", "Support Vector")) +
  scale_size_manual(values = c("FALSE" = 2, "TRUE" = 4), guide = "none") +
  labs(title = "Residuals Analysis - Support Vectors Identification",
       x = "Predicted MPG",
       y = "Residuals",
       color = "Point Type",
       subtitle = paste("Points outside epsilon-tube (±", round(eps_tube, 2),
                        ") become support vectors")) +
  theme_minimal()

10. Eksplorasi Parameter

10.1 Pengaruh Parameter Epsilon

# Sweep epsilon while cost and gamma stay at their tuned values
epsilon_values <- c(0.01, 0.05, 0.1, 0.2, 0.5, 1.0)
n_eps <- length(epsilon_values)
epsilon_results <- data.frame(
  Epsilon = epsilon_values,
  RMSE_Train = numeric(n_eps),
  RMSE_Test = numeric(n_eps),
  Support_Vectors = numeric(n_eps)
)

for (i in seq_along(epsilon_values)) {
  # Refit with the i-th epsilon
  model_eps <- svm(
    mpg ~ .,
    data = train_scaled,
    type = "eps-regression",
    kernel = "radial",
    cost = best_params$cost,
    gamma = best_params$gamma,
    epsilon = epsilon_values[i],
    scale = FALSE
  )

  # Score on both partitions and record the support-vector count
  pred_train <- predict(model_eps, train_scaled)
  pred_test <- predict(model_eps, test_scaled)

  epsilon_results[i, "RMSE_Train"] <- rmse(train_data$mpg, pred_train)
  epsilon_results[i, "RMSE_Test"] <- rmse(test_data$mpg, pred_test)
  epsilon_results[i, "Support_Vectors"] <- model_eps$tot.nSV
}

print(epsilon_results)
##   Epsilon RMSE_Train RMSE_Test Support_Vectors
## 1    0.01   2.001340  2.883725              24
## 2    0.05   1.992592  2.900373              24
## 3    0.10   1.979962  2.889359              24
## 4    0.20   1.957760  2.866061              24
## 5    0.50   1.911806  2.696299              23
## 6    1.00   1.918997  2.376505              17
# Plot the effect of epsilon.
# Reshape the two RMSE columns to long format so they can be mapped to colour;
# pivot_longer() replaces the superseded gather().
eps_long <- epsilon_results %>%
  pivot_longer(
    cols = c(RMSE_Train, RMSE_Test),
    names_to = "Metric",
    values_to = "Value"
  )

# Train/test RMSE against epsilon
p1 <- ggplot(eps_long, aes(x = Epsilon, y = Value, color = Metric)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  labs(title = "Pengaruh Parameter Epsilon terhadap RMSE",
       x = "Epsilon",
       y = "RMSE",
       color = "Dataset") +
  theme_minimal()

# Number of support vectors against epsilon
p2 <- ggplot(epsilon_results, aes(x = Epsilon, y = Support_Vectors)) +
  geom_line(size = 1.2, color = "darkgreen") +
  geom_point(size = 3, color = "darkgreen") +
  labs(title = "Pengaruh Parameter Epsilon terhadap Jumlah Support Vectors",
       x = "Epsilon",
       y = "Jumlah Support Vectors") +
  theme_minimal()

grid.arrange(p1, p2, ncol = 2)

10.2 Pengaruh Parameter C (Cost)

# Sweep cost (C) while gamma and epsilon stay at their tuned values
cost_values <- c(0.01, 0.1, 1, 10, 100, 1000)
n_cost <- length(cost_values)
cost_results <- data.frame(
  Cost = cost_values,
  RMSE_Train = numeric(n_cost),
  RMSE_Test = numeric(n_cost),
  Support_Vectors = numeric(n_cost)
)

for (i in seq_along(cost_values)) {
  # Refit with the i-th cost
  model_cost <- svm(
    mpg ~ .,
    data = train_scaled,
    type = "eps-regression",
    kernel = "radial",
    cost = cost_values[i],
    gamma = best_params$gamma,
    epsilon = best_params$epsilon,
    scale = FALSE
  )

  # Score on both partitions and record the support-vector count
  pred_train <- predict(model_cost, train_scaled)
  pred_test <- predict(model_cost, test_scaled)

  cost_results[i, "RMSE_Train"] <- rmse(train_data$mpg, pred_train)
  cost_results[i, "RMSE_Test"] <- rmse(test_data$mpg, pred_test)
  cost_results[i, "Support_Vectors"] <- model_cost$tot.nSV
}

print(cost_results)
##    Cost RMSE_Train RMSE_Test Support_Vectors
## 1 1e-02   6.092428  5.498389              22
## 2 1e-01   5.937176  5.353396              22
## 3 1e+00   4.749888  4.222613              22
## 4 1e+01   2.608767  2.108564              22
## 5 1e+02   1.911806  2.696299              23
## 6 1e+03   1.420705  4.905910              21
# Plot the effect of cost (C).
# Reshape the two RMSE columns to long format so they can be mapped to colour;
# pivot_longer() replaces the superseded gather().
cost_long <- cost_results %>%
  pivot_longer(
    cols = c(RMSE_Train, RMSE_Test),
    names_to = "Metric",
    values_to = "Value"
  )

# Train/test RMSE against C. Positions are log10(C); tick labels show the
# original cost values.
p1 <- ggplot(cost_long, aes(x = log10(Cost), y = Value, color = Metric)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  labs(title = "Pengaruh Parameter C terhadap RMSE",
       x = "log10(C)",
       y = "RMSE",
       color = "Dataset") +
  theme_minimal() +
  scale_x_continuous(breaks = log10(cost_values), labels = cost_values)

# Number of support vectors against C (same x-axis treatment)
p2 <- ggplot(cost_results, aes(x = log10(Cost), y = Support_Vectors)) +
  geom_line(size = 1.2, color = "purple") +
  geom_point(size = 3, color = "purple") +
  labs(title = "Pengaruh Parameter C terhadap Jumlah Support Vectors",
       x = "log10(C)",
       y = "Jumlah Support Vectors") +
  theme_minimal() +
  scale_x_continuous(breaks = log10(cost_values), labels = cost_values)

grid.arrange(p1, p2, ncol = 2)

10.3 Pengaruh Parameter Gamma

# Sweep gamma while cost and epsilon stay at their tuned values
gamma_values <- c(0.001, 0.01, 0.1, 0.5, 1, 2, 5)
n_gamma <- length(gamma_values)
gamma_results <- data.frame(
  Gamma = gamma_values,
  RMSE_Train = numeric(n_gamma),
  RMSE_Test = numeric(n_gamma),
  Support_Vectors = numeric(n_gamma)
)

for (i in seq_along(gamma_values)) {
  # Refit with the i-th gamma
  model_gamma <- svm(
    mpg ~ .,
    data = train_scaled,
    type = "eps-regression",
    kernel = "radial",
    cost = best_params$cost,
    gamma = gamma_values[i],
    epsilon = best_params$epsilon,
    scale = FALSE
  )

  # Score on both partitions and record the support-vector count
  pred_train <- predict(model_gamma, train_scaled)
  pred_test <- predict(model_gamma, test_scaled)

  gamma_results[i, "RMSE_Train"] <- rmse(train_data$mpg, pred_train)
  gamma_results[i, "RMSE_Test"] <- rmse(test_data$mpg, pred_test)
  gamma_results[i, "Support_Vectors"] <- model_gamma$tot.nSV
}

print(gamma_results)
##   Gamma RMSE_Train RMSE_Test Support_Vectors
## 1 0.001  2.5070875  2.055603              23
## 2 0.010  1.9118058  2.696299              23
## 3 0.100  0.6394558  5.765392              23
## 4 0.500  0.4999488  5.002868              24
## 5 1.000  0.4856315  4.781565              21
## 6 2.000  0.4940382  4.974310              22
## 7 5.000  0.4895685  5.365265              23
# Plot the effect of gamma.
# Reshape the two RMSE columns to long format so they can be mapped to colour;
# pivot_longer() replaces the superseded gather().
gamma_long <- gamma_results %>%
  pivot_longer(
    cols = c(RMSE_Train, RMSE_Test),
    names_to = "Metric",
    values_to = "Value"
  )

# Train/test RMSE against gamma. Positions are log10(gamma); tick labels show
# the original gamma values.
p1 <- ggplot(gamma_long, aes(x = log10(Gamma), y = Value, color = Metric)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  labs(title = "Pengaruh Parameter Gamma terhadap RMSE",
       x = "log10(Gamma)",
       y = "RMSE",
       color = "Dataset") +
  theme_minimal() +
  scale_x_continuous(breaks = log10(gamma_values), labels = gamma_values)

p2 <- ggplot(gamma_results, aes(x = log10(Gamma), y = Support_Vectors)) +
  geom_line(size = 1)