Tujuan: Menganalisis dan membandingkan performa dari SVM dan SVR dengan pembandingnya masing-masing
Dataset: Breast Cancer Wisconsin (699 sampel, 9 fitur) untuk SVM dan Air Quality (153 observasi, 6 variabel) untuk SVR
Metode: Cross-validation, Grid Search, ROC Analysis (SVM) serta SVR Linear, SVR RBF, OLS Regression, Parameter Tuning (SVR)
Support Vector Machine (SVM) adalah algoritma pembelajaran mesin yang digunakan untuk klasifikasi data. Prinsip kerja SVM adalah dengan mencari sebuah garis atau hyperplane optimal yang memisahkan kelas-kelas dalam data, sambil memaksimalkan jarak atau margin antar kelas tersebut dalam ruang berdimensi-N.
# Synthetic, linearly separable 2D data used to illustrate the SVM concept.
set.seed(42)
n <- 50

# Draw one Gaussian blob of `n` points centred at (centre, centre).
# x1 is drawn before x2 so the random stream is consumed in a fixed order.
simulate_class <- function(centre, label) {
  data.frame(
    x1 = rnorm(n, mean = centre, sd = 0.8),
    x2 = rnorm(n, mean = centre, sd = 0.8),
    class = label
  )
}

# Class A sits around (2, 2), Class B around (4, 4).
class1 <- simulate_class(2, "Class A")
class2 <- simulate_class(4, "Class B")
svm_data <- rbind(class1, class2)
# Create the conceptual SVM plot
p_concept <- ggplot(svm_data, aes(x = x1, y = x2, color = class, shape = class)) +
geom_point(size = 4, alpha = 0.8) +
# Hyperplane
geom_abline(intercept = 0, slope = 1, color = "black", size = 1.2, linetype = "solid") +
# Margin boundaries
geom_abline(intercept = -0.5, slope = 1, color = "gray40", size = 0.8, linetype = "dashed") +
geom_abline(intercept = 0.5, slope = 1, color = "gray40", size = 0.8, linetype = "dashed") +
# Support vectors (highlighted)
geom_point(data = data.frame(x1 = c(2.5, 3.5), x2 = c(2.5, 3.5),
class = c("Class A", "Class B")),
aes(x = x1, y = x2), size = 6, color = "red", shape = 1, stroke = 2) +
scale_color_manual(values = c("Class A" = "#3498db", "Class B" = "#e74c3c")) +
scale_shape_manual(values = c("Class A" = 16, "Class B" = 17)) +
labs(
title = "Konsep Dasar Support Vector Machine",
subtitle = "Hyperplane Optimal dengan Margin Maksimal",
x = "Feature 1 (x₁)",
y = "Feature 2 (x₂)",
color = "Kelas",
shape = "Kelas"
) +
annotate("text", x = 1.5, y = 4, label = "Hyperplane\n(Decision Boundary)",
color = "black", fontface = "bold", size = 4) +
annotate("text", x = 1, y = 3, label = "Margin", color = "gray40",
fontface = "bold", size = 4) +
annotate("text", x = 4.5, y = 2, label = "Support\nVectors", color = "red",
fontface = "bold", size = 4) +
coord_fixed(ratio = 1) +
theme_publication()
print(p_concept)Gambar 1: Ilustrasi Konsep Dasar SVM - Hyperplane, Margin, dan Support Vectors
Batas keputusan yang memisahkan kelas-kelas data dalam ruang fitur
Jarak antara hyperplane dengan data point terdekat dari setiap kelas
Titik-titik data yang berada pada batas margin dan menentukan hyperplane
# Two concentric rings: a data set that no straight line can separate.
set.seed(123)
n <- 100

# One random angle per point, then radii for the inner and outer ring.
theta <- runif(n, 0, 2 * pi)
r1 <- runif(n / 2, 0.5, 1.5)
r2 <- runif(n / 2, 2.5, 3.5)

# The first half of `theta` pairs with the inner radii, the second half with
# the outer radii, so the polar-to-Cartesian conversion can be done in one go.
radius <- c(r1, r2)
circle_data <- data.frame(
  x1 = radius * cos(theta),
  x2 = radius * sin(theta),
  class = rep(c("Inner", "Outer"), each = n / 2)
)
# Create kernel comparison plot
p_kernel <- ggplot(circle_data, aes(x = x1, y = x2, color = class, shape = class)) +
geom_point(size = 3, alpha = 0.8) +
scale_color_manual(values = c("Inner" = "#2ecc71", "Outer" = "#e74c3c")) +
scale_shape_manual(values = c("Inner" = 16, "Outer" = 17)) +
labs(
title = "Data Non-Linear: Membutuhkan Kernel RBF",
subtitle = "Linear kernel tidak dapat memisahkan data circular",
x = "Feature 1 (x₁)",
y = "Feature 2 (x₂)",
color = "Kelas",
shape = "Kelas"
) +
coord_fixed(ratio = 1) +
theme_publication()
# Kernel function visualization: three illustrative curves over x in [-3, 3],
# stacked in long format (100 points per curve) for a single ggplot call.
x <- seq(-3, 3, length.out = 100)
kernel_data <- data.frame(
x = rep(x, 3),
y = c(
pmax(0, 1 - abs(x)), # Plotted under the "Linear" label; NOTE(review): this is a triangular (hat) function of x, not the linear kernel x * x'
exp(-x^2), # RBF kernel exp(-gamma * (x - x')^2) with gamma = 1 and x' = 0
(1 + x)^2 # Polynomial kernel (gamma * x * x' + r)^d with gamma = 1, x' = 1, r = 1, d = 2
),
# Labels must follow the same order as the three curves above
kernel = rep(c("Linear", "RBF (Gaussian)", "Polynomial"), each = 100)
)
# One line per kernel label, coloured by kernel type
p_kernel_func <- ggplot(kernel_data, aes(x = x, y = y, color = kernel)) +
geom_line(size = 1.5) +
scale_color_manual(values = c("Linear" = "#3498db", "RBF (Gaussian)" = "#e74c3c", "Polynomial" = "#2ecc71")) +
labs(
title = "Kernel Functions",
subtitle = "Berbagai fungsi kernel untuk transformasi data",
x = "Input (x)",
y = "Kernel Output K(x,x')",
color = "Jenis Kernel"
) +
theme_publication()
grid.arrange(p_kernel, p_kernel_func, ncol = 2)Gambar 2: Perbandingan Kernel Functions pada Data Non-Linear
library(kableExtra)
# Reference table of the common SVM kernels.
# `check.names = FALSE` keeps the human-readable headers as written; without
# it data.frame() rewrites "Kasus Penggunaan" to "Kasus.Penggunaan" and
# "Parameter Utama" to "Parameter.Utama" in the rendered table.
kernel_comparison <- data.frame(
  Kernel = c("Linear", "RBF (Radial Basis Function)", "Polynomial", "Sigmoid"),
  Formula = c(
    "K(x,x') = x·x'",
    "K(x,x') = exp(-γ||x-x'||²)",
    "K(x,x') = (γx·x' + r)^d",
    "K(x,x') = tanh(γx·x' + r)"
  ),
  `Kasus Penggunaan` = c(
    "Data linear separable, interpretasi mudah",
    "Data non-linear, paling populer",
    "Data dengan interaksi fitur",
    "Neural network-like"
  ),
  `Parameter Utama` = c("C", "C, γ (gamma)", "C, γ, d, r", "C, γ, r"),
  check.names = FALSE
)
kable(kernel_comparison,
caption = "**Tabel 1:** Perbandingan Jenis Kernel SVM",
align = "l") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE,
position = "center",
font_size = 12) %>% # font_size dipindah ke sini
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
column_spec(2, monospace = TRUE) %>% # hapus font_size dari sini
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Kernel | Formula | Kasus.Penggunaan | Parameter.Utama |
|---|---|---|---|
| Linear | K(x,x’) = x·x’ | Data linear separable, interpretasi mudah | C |
| RBF (Radial Basis Function) | K(x,x’) = exp(-γ||x-x’||²) | Data non-linear, paling populer | C, γ (gamma) |
| Polynomial | K(x,x’) = (γx·x’ + r)^d | Data dengan interaksi fitur | C, γ, d, r |
| Sigmoid | K(x,x’) = tanh(γx·x’ + r) | Neural network-like | C, γ, r |
# Toy data for the cost-parameter illustration: two overlapping Gaussian
# classes plus one deliberately misplaced point per class.
set.seed(42)
n <- 40
half <- n / 2

# Both x1 groups are drawn before both x2 groups (fixed random-stream order).
x1_vals <- c(rnorm(half, 1, 0.5), rnorm(half, 2, 0.5))
x2_vals <- c(rnorm(half, 1, 0.5), rnorm(half, 2, 0.5))
sim_data <- data.frame(
  x1 = x1_vals,
  x2 = x2_vals,
  class = rep(c("A", "B"), each = half)
)

# Add some noise/outliers: row 5 (class A) and row 25 (class B) are moved
# away from their own cluster.
outlier_rows <- c(5, 25)
sim_data$x1[outlier_rows] <- c(2.5, 0.5)
sim_data$x2[outlier_rows] <- c(0.5, 2.5)
# Create plots showing effect of C
p_c_low <- ggplot(sim_data, aes(x = x1, y = x2, color = class)) +
geom_point(size = 4, alpha = 0.8) +
geom_smooth(method = "lm", se = FALSE, size = 1.2, alpha = 0.7) +
scale_color_manual(values = c("A" = "#3498db", "B" = "#e74c3c")) +
labs(title = "C Rendah (C = 0.1)", subtitle = "Boundary lebih smooth, toleran outlier") +
theme_publication() +
theme(legend.position = "none")
p_c_high <- ggplot(sim_data, aes(x = x1, y = x2, color = class)) +
geom_point(size = 4, alpha = 0.8) +
geom_smooth(method = "loess", se = FALSE, size = 1.2, alpha = 0.7, span = 0.3) +
scale_color_manual(values = c("A" = "#3498db", "B" = "#e74c3c")) +
labs(title = "C Tinggi (C = 100)", subtitle = "Boundary kompleks, fit semua data") +
theme_publication() +
theme(legend.position = "none")
grid.arrange(p_c_low, p_c_high, ncol = 2,
top = textGrob("Pengaruh Parameter C terhadap Complexity",
gp = gpar(fontsize = 16, fontface = "bold")))Gambar 3: Pengaruh Parameter C dan Gamma terhadap Decision Boundary
## 📦 Loading libraries...
library(e1071) # SVM implementation
library(caret) # Machine learning tools
library(ggplot2) # Data visualization
library(gridExtra) # Arrange plots
library(corrplot) # Correlation plot
library(dplyr) # Data manipulation
library(ROCR) # ROC curves
library(plotly) # Interactive plots
library(viridis) # Color schemes
library(RColorBrewer) # Color palettes
library(kableExtra) # Beautiful tables
library(DT) # Interactive tables
# Set seed for reproducibility
set.seed(123)
cat("✅ Libraries loaded successfully!\n")## ✅ Libraries loaded successfully!
# Load required library for dataset
library(mlbench)
library(DT)
# Load Breast Cancer Wisconsin dataset
data(BreastCancer, package = "mlbench")
# Display dataset information
cat("📊 Dataset Information:\n")## 📊 Dataset Information:
## - Jumlah sampel: 699
## - Jumlah fitur: 10
## - Jumlah kelas: 2
# Create interactive data table
DT::datatable(head(BreastCancer, 10),
caption = "Tabel 2: Sample Data Breast Cancer Wisconsin",
options = list(scrollX = TRUE, pageLength = 5))# Missing values analysis - PERBAIKAN: ganti gather() dengan pivot_longer()
missing_data <- BreastCancer %>%
summarise_all(~sum(is.na(.))) %>%
pivot_longer(cols = everything(),
names_to = "Variable",
values_to = "Missing_Count") %>%
mutate(Missing_Percentage = (Missing_Count / nrow(BreastCancer)) * 100)
p_missing <- ggplot(missing_data, aes(x = reorder(Variable, Missing_Count), y = Missing_Count)) +
geom_col(fill = "#e74c3c", alpha = 0.7) +
geom_text(aes(label = paste0(Missing_Count, " (", round(Missing_Percentage, 1), "%)")),
hjust = -0.1, size = 3) +
coord_flip() +
labs(title = "Missing Values Analysis",
x = "Variables", y = "Missing Count") +
theme_publication()
# Class distribution
p_class <- BreastCancer %>%
count(Class) %>%
mutate(Percentage = round(n/sum(n)*100, 1)) %>%
ggplot(aes(x = Class, y = n, fill = Class)) +
geom_col(alpha = 0.8, width = 0.6) +
geom_text(aes(label = paste0(n, "\n(", Percentage, "%)")),
vjust = 1.2, color = "white", fontface = "bold", size = 4) +
scale_fill_manual(values = c("benign" = "#2ecc71", "malignant" = "#e74c3c")) +
labs(title = "Class Distribution",
subtitle = "Benign vs Malignant Cases",
x = "Class", y = "Count") +
theme_publication() +
theme(legend.position = "none")
grid.arrange(p_missing, p_class, ncol = 2)Gambar 4: Overview Dataset Breast Cancer Wisconsin
# Data cleaning and preprocessing:
# drop the ID column (first column), then keep only rows without missing values.
bc_data <- na.omit(BreastCancer[, -1])

# Columns 1-9 are converted to numeric via their character labels so the
# printed values (not the internal factor codes) are kept; Class is untouched.
bc_data[1:9] <- lapply(
  bc_data[1:9],
  function(feature) as.numeric(as.character(feature))
)
cat("🧹 Data Cleaning Summary:\n")## 🧹 Data Cleaning Summary:
## - Original samples: 699
## - After cleaning: 683
## - Removed samples: 16
# Descriptive statistics (mean, SD, min, max) for every predictor, one row per
# feature. Uses across() + pivot_longer() instead of the superseded
# summarise_all()/gather()/separate()/spread() chain.
feature_stats <- bc_data %>%
  select(-Class) %>%
  summarise(across(
    everything(),
    list(
      Mean = ~ round(mean(.x), 2),
      SD = ~ round(sd(.x), 2),
      Min = ~ min(.x),
      Max = ~ max(.x)
    )
  )) %>%
  # across() names the results "<feature>_<statistic>"; splitting on "_" sends
  # the feature name to `Feature` and each statistic to its own column.
  pivot_longer(
    cols = everything(),
    names_to = c("Feature", ".value"),
    names_sep = "_"
  ) %>%
  # Keep the column and row order the previous spread()-based table had.
  select(Feature, Max, Mean, Min, SD) %>%
  arrange(Feature) %>%
  as.data.frame()
kable(feature_stats,
caption = "**Tabel 3:** Statistik Deskriptif Fitur",
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Feature | Max | Mean | Min | SD |
|---|---|---|---|---|
| Bare.nuclei | 10 | 3.54 | 1 | 3.64 |
| Bl.cromatin | 10 | 3.45 | 1 | 2.45 |
| Cell.shape | 10 | 3.22 | 1 | 2.99 |
| Cell.size | 10 | 3.15 | 1 | 3.07 |
| Cl.thickness | 10 | 4.44 | 1 | 2.82 |
| Epith.c.size | 10 | 3.23 | 1 | 2.22 |
| Marg.adhesion | 10 | 2.83 | 1 | 2.86 |
| Mitoses | 10 | 1.60 | 1 | 1.73 |
| Normal.nucleoli | 10 | 2.87 | 1 | 3.05 |
# Correlation analysis with enhanced visualization
cor_matrix <- cor(bc_data[, 1:9])
# Create a more aesthetic correlation plot
col_palette <- colorRampPalette(c("#e74c3c", "white", "#3498db"))(100)
# Enhanced correlation plot
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
col = col_palette,
tl.col = "black",
tl.srt = 45,
tl.cex = 0.8,
cl.cex = 0.8,
title = "Correlation Matrix of Features",
mar = c(0,0,2,0),
addCoef.col = "black",
number.cex = 0.6)Gambar 5: Analisis Korelasi Fitur dan Distribusi Data
# Feature distributions by class with enhanced aesthetics
bc_melted <- bc_data %>%
gather(key = "Feature", value = "Value", -Class)
p_distributions <- ggplot(bc_melted, aes(x = Value, fill = Class)) +
geom_density(alpha = 0.7, color = "white", size = 0.5) +
facet_wrap(~Feature, scales = "free", ncol = 3) +
scale_fill_manual(values = c("benign" = "#2ecc71", "malignant" = "#e74c3c")) +
labs(title = "Feature Distributions by Class",
subtitle = "Density plots showing class separation",
x = "Feature Value", y = "Density") +
theme_publication() +
theme(strip.background = element_rect(fill = "#ecf0f1", color = "#bdc3c7"),
strip.text = element_text(face = "bold"))
print(p_distributions)Gambar 5: Analisis Korelasi Fitur dan Distribusi Data
# Stratified 70:30 split to maintain class distribution. The partition is
# drawn first so the scaling parameters can be estimated from training rows
# only; scaling on the full data set would leak test-set statistics.
train_index <- createDataPartition(bc_data$Class, p = 0.7, list = FALSE)

# Scale the features (columns 1-9) using the training mean and SD.
feature_cols <- 1:9
train_center <- colMeans(bc_data[train_index, feature_cols])
train_spread <- vapply(bc_data[train_index, feature_cols], sd, numeric(1))

bc_scaled <- bc_data
bc_scaled[, feature_cols] <- scale(
  bc_data[, feature_cols],
  center = train_center,
  scale = train_spread
)

train_data <- bc_scaled[train_index, ]
test_data <- bc_scaled[-train_index, ]
# Create visualization of data split
split_info <- data.frame(
Dataset = c("Training", "Testing", "Training", "Testing"),
Class = c("Benign", "Benign", "Malignant", "Malignant"),
Count = c(
sum(train_data$Class == "benign"),
sum(test_data$Class == "benign"),
sum(train_data$Class == "malignant"),
sum(test_data$Class == "malignant")
)
)
p_split <- ggplot(split_info, aes(x = Dataset, y = Count, fill = Class)) +
geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
geom_text(aes(label = Count), position = position_dodge(width = 0.7),
vjust = -0.5, fontface = "bold") +
scale_fill_manual(values = c("Benign" = "#2ecc71", "Malignant" = "#e74c3c")) +
labs(title = "Data Split Strategy",
subtitle = "Stratified 70:30 split maintaining class distribution",
x = "Dataset", y = "Sample Count") +
theme_publication()
# Data split summary table: sample counts and class shares for the training
# set, the test set and the full cleaned data set.

# Format the number of rows of `data` whose Class equals `label` as
# "<count> (<percent>%)", with the percentage rounded to one decimal.
class_share <- function(data, label) {
  n_label <- sum(data$Class == label)
  paste0(n_label, " (", round(n_label / nrow(data) * 100, 1), "%)")
}

split_parts <- list(train_data, test_data, bc_scaled)

# `check.names = FALSE` keeps headers such as "Benign (%)" intact; otherwise
# data.frame() rewrites them to syntactic names like "Benign....".
split_summary <- data.frame(
  Dataset = c("Training", "Testing", "Total"),
  `Total Samples` = vapply(split_parts, nrow, integer(1)),
  `Benign (%)` = vapply(
    split_parts, class_share, character(1),
    label = "benign"
  ),
  `Malignant (%)` = vapply(
    split_parts, class_share, character(1),
    label = "malignant"
  ),
  check.names = FALSE
)
kable(split_summary,
caption = "**Tabel 4:** Ringkasan Pembagian Dataset",
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db") %>%
row_spec(3, bold = TRUE, background = "#ecf0f1")| Dataset | Total.Samples | Benign…. | Malignant…. |
|---|---|---|---|
| Training | 479 | 311 (64.9%) | 168 (35.1%) |
| Testing | 204 | 133 (65.2%) | 71 (34.8%) |
| Total | 683 | 444 (65%) | 239 (35%) |
Gambar 6: Strategi Pembagian Data dan Validasi
## 🚀 Training Linear SVM...
# Train Linear SVM
svm_linear <- svm(Class ~ ., data = train_data, kernel = "linear", cost = 1)
# Model summary
cat("📊 Linear SVM Model Summary:\n")## 📊 Linear SVM Model Summary:
##
## Call:
## svm(formula = Class ~ ., data = train_data, kernel = "linear", cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 40
##
## ( 18 22 )
##
##
## Number of Classes: 2
##
## Levels:
## benign malignant
# Predictions on test set
pred_linear <- predict(svm_linear, test_data)
# Confusion Matrix with enhanced visualization
cm_linear <- confusionMatrix(pred_linear, test_data$Class, positive = "malignant")
# Create enhanced confusion matrix visualization
cm_df <- as.data.frame(cm_linear$table)
colnames(cm_df) <- c("Predicted", "Actual", "Freq")
p_cm_linear <- ggplot(cm_df, aes(x = Predicted, y = Actual, fill = Freq)) +
geom_tile(color = "white", size = 2) +
geom_text(aes(label = Freq), size = 8, fontface = "bold", color = "white") +
scale_fill_gradient(low = "#3498db", high = "#e74c3c", name = "Count") +
labs(title = "Confusion Matrix - Linear SVM",
subtitle = paste0("Accuracy: ", round(cm_linear$overall['Accuracy'], 4)),
x = "Predicted Class", y = "Actual Class") +
theme_publication() +
theme(legend.position = "right",
axis.text = element_text(size = 12, face = "bold"))
print(p_cm_linear)## 🚀 Training RBF SVM...
# Train RBF SVM
svm_rbf <- svm(Class ~ ., data = train_data, kernel = "radial", cost = 1, gamma = 0.1)
# Model summary
cat("📊 RBF SVM Model Summary:\n")## 📊 RBF SVM Model Summary:
##
## Call:
## svm(formula = Class ~ ., data = train_data, kernel = "radial", cost = 1,
## gamma = 0.1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 73
##
## ( 26 47 )
##
##
## Number of Classes: 2
##
## Levels:
## benign malignant
# Predictions on test set
pred_rbf <- predict(svm_rbf, test_data)
# Confusion Matrix
cm_rbf <- confusionMatrix(pred_rbf, test_data$Class, positive = "malignant")
# Create enhanced confusion matrix visualization for RBF
cm_df_rbf <- as.data.frame(cm_rbf$table)
colnames(cm_df_rbf) <- c("Predicted", "Actual", "Freq")
p_cm_rbf <- ggplot(cm_df_rbf, aes(x = Predicted, y = Actual, fill = Freq)) +
geom_tile(color = "white", size = 2) +
geom_text(aes(label = Freq), size = 8, fontface = "bold", color = "white") +
scale_fill_gradient(low = "#3498db", high = "#e74c3c", name = "Count") +
labs(title = "Confusion Matrix - RBF SVM",
subtitle = paste0("Accuracy: ", round(cm_rbf$overall['Accuracy'], 4)),
x = "Predicted Class", y = "Actual Class") +
theme_publication() +
theme(legend.position = "right",
axis.text = element_text(size = 12, face = "bold"))
print(p_cm_rbf)## 🔍 Performing Grid Search for Hyperparameter Tuning...
# Pastikan kernlab tersedia
library(kernlab)
# Define parameter grids
linear_grid <- expand.grid(C = c(0.1, 1, 10, 100))
rbf_grid <- expand.grid(C = c(0.1, 1, 10, 100),
sigma = c(0.001, 0.01, 0.1, 1)) # ✅ pakai 'sigma'
# Cross-validation setup
ctrl <- trainControl(method = "cv",
number = 5,
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE)
# Train Linear SVM with Grid Search
set.seed(123)
svm_linear_cv <- train(Class ~ .,
data = train_data,
method = "svmLinear",
tuneGrid = linear_grid,
trControl = ctrl,
metric = "ROC")
# Train RBF SVM with Grid Search
set.seed(123)
svm_rbf_cv <- train(Class ~ .,
data = train_data,
method = "svmRadial",
tuneGrid = rbf_grid,
trControl = ctrl,
metric = "ROC")
# Visualize tuning results
p_linear_tune <- ggplot(svm_linear_cv) +
labs(title = "Linear SVM Hyperparameter Tuning",
subtitle = "Cross-validation performance vs Cost parameter") +
theme_publication()
p_rbf_tune <- ggplot(svm_rbf_cv) +
labs(title = "RBF SVM Hyperparameter Tuning",
subtitle = "Cross-validation performance vs Cost and Sigma") +
theme_publication()
grid.arrange(p_linear_tune, p_rbf_tune, ncol = 2)Gambar 7: Grid Search untuk Optimasi Hyperparameter
##
## 🎯 Best Parameters Found:
## Linear SVM - Best C: 0.1
## RBF SVM - Best C: 10 Best Sigma: 0.001
# Train final models with best parameters
final_linear <- svm(Class ~ ., data = train_data,
kernel = "linear",
cost = svm_linear_cv$bestTune$C)
final_rbf <- svm(Class ~ ., data = train_data,
kernel = "radial",
cost = svm_rbf_cv$bestTune$C,
gamma = svm_rbf_cv$bestTune$sigma)
# Predictions
pred_final_linear <- predict(final_linear, test_data)
pred_final_rbf <- predict(final_rbf, test_data)
# Performance metrics
cm_final_linear <- confusionMatrix(pred_final_linear, test_data$Class, positive = "malignant")
cm_final_rbf <- confusionMatrix(pred_final_rbf, test_data$Class, positive = "malignant")
# Create comprehensive performance comparison
performance_metrics <- data.frame(
Model = c("Linear SVM", "RBF SVM"),
Accuracy = c(cm_final_linear$overall['Accuracy'], cm_final_rbf$overall['Accuracy']),
Sensitivity = c(cm_final_linear$byClass['Sensitivity'], cm_final_rbf$byClass['Sensitivity']),
Specificity = c(cm_final_linear$byClass['Specificity'], cm_final_rbf$byClass['Specificity']),
Precision = c(cm_final_linear$byClass['Pos Pred Value'], cm_final_rbf$byClass['Pos Pred Value']),
F1_Score = c(cm_final_linear$byClass['F1'], cm_final_rbf$byClass['F1']),
Kappa = c(cm_final_linear$overall['Kappa'], cm_final_rbf$overall['Kappa'])
)
# Round metrics for better display
performance_metrics[, 2:7] <- round(performance_metrics[, 2:7], 4)
kable(performance_metrics,
caption = "**Tabel 5:** Perbandingan Performa Model SVM",
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db") %>%
row_spec(which.max(performance_metrics$Accuracy), bold = TRUE, background = "#d5f4e6")| Model | Accuracy | Sensitivity | Specificity | Precision | F1_Score | Kappa |
|---|---|---|---|---|---|---|
| Linear SVM | 0.9755 | 0.9718 | 0.9774 | 0.9583 | 0.965 | 0.9462 |
| RBF SVM | 0.9755 | 0.9718 | 0.9774 | 0.9583 | 0.965 | 0.9462 |
# Visualize performance metrics
perf_melted <- performance_metrics %>%
gather(key = "Metric", value = "Value", -Model)
p_performance <- ggplot(perf_melted, aes(x = Metric, y = Value, fill = Model)) +
geom_col(position = "dodge", alpha = 0.8, width = 0.7) +
geom_text(aes(label = round(Value, 3)),
position = position_dodge(width = 0.7),
vjust = -0.5, fontface = "bold", size = 3) +
scale_fill_manual(values = c("Linear SVM" = "#3498db", "RBF SVM" = "#e74c3c")) +
labs(title = "Model Performance Comparison",
subtitle = "Various evaluation metrics",
x = "Performance Metrics", y = "Score") +
theme_publication() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(p_performance)Gambar 8: Perbandingan Performa Model
# Score the test set for ROC analysis. With decision.values = TRUE the
# prediction carries signed SVM decision values (not probabilities) in the
# "decision.values" attribute.
prob_linear <- predict(final_linear, test_data, decision.values = TRUE)
prob_rbf <- predict(final_rbf, test_data, decision.values = TRUE)

# Return decision values oriented so that larger means "more `positive`".
# The decision-value column is named "<labelA>/<labelB>" and positive values
# vote for labelA. If labelA is not the positive class the sign is flipped;
# without this the ROC curve is mirrored and the AUC comes out near 0
# instead of near 1.
orient_decision_values <- function(pred, positive = "malignant") {
  dv <- attr(pred, "decision.values")
  first_label <- strsplit(colnames(dv)[1], "/", fixed = TRUE)[[1]][1]
  if (identical(first_label, positive)) dv[, 1] else -dv[, 1]
}

# Extract decision values (higher = more likely malignant)
dec_linear <- orient_decision_values(prob_linear)
dec_rbf <- orient_decision_values(prob_rbf)

# Create ROCR prediction objects; label.ordering is c(negative, positive) and
# is stated explicitly so it cannot drift from the score orientation above.
roc_labels <- c("benign", "malignant")
pred_rocr_linear <- prediction(dec_linear, test_data$Class,
                               label.ordering = roc_labels)
pred_rocr_rbf <- prediction(dec_rbf, test_data$Class,
                            label.ordering = roc_labels)

# Calculate ROC curves
roc_linear <- performance(pred_rocr_linear, "tpr", "fpr")
roc_rbf <- performance(pred_rocr_rbf, "tpr", "fpr")

# Calculate AUC
auc_linear <- performance(pred_rocr_linear, "auc")@y.values[[1]]
auc_rbf <- performance(pred_rocr_rbf, "auc")@y.values[[1]]
# Create ROC curve data frame
roc_data <- data.frame(
FPR = c(roc_linear@x.values[[1]], roc_rbf@x.values[[1]]),
TPR = c(roc_linear@y.values[[1]], roc_rbf@y.values[[1]]),
Model = c(rep("Linear SVM", length(roc_linear@x.values[[1]])),
rep("RBF SVM", length(roc_rbf@x.values[[1]])))
)
# Plot ROC curves
p_roc <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Model)) +
geom_line(size = 1.5, alpha = 0.8) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "gray60", size = 1) +
scale_color_manual(values = c("Linear SVM" = "#3498db", "RBF SVM" = "#e74c3c"),
labels = c(paste0("Linear SVM (AUC = ", round(auc_linear, 3), ")"),
paste0("RBF SVM (AUC = ", round(auc_rbf, 3), ")"))) +
labs(title = "ROC Curve Comparison",
subtitle = "Receiver Operating Characteristic Analysis",
x = "False Positive Rate (1 - Specificity)",
y = "True Positive Rate (Sensitivity)") +
theme_publication() +
theme(legend.position = "bottom")
print(p_roc)Gambar 9: ROC Curve Analysis untuk Kedua Model
# AUC comparison table
auc_comparison <- data.frame(
Model = c("Linear SVM", "RBF SVM"),
AUC = c(round(auc_linear, 4), round(auc_rbf, 4)),
Interpretation = c(
ifelse(auc_linear > 0.9, "Excellent", ifelse(auc_linear > 0.8, "Good", "Fair")),
ifelse(auc_rbf > 0.9, "Excellent", ifelse(auc_rbf > 0.8, "Good", "Fair"))
)
)
kable(auc_comparison,
caption = "**Tabel 6:** Area Under Curve (AUC) Comparison",
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db") %>%
row_spec(which.max(auc_comparison$AUC), bold = TRUE, background = "#d5f4e6")| Model | AUC | Interpretation |
|---|---|---|
| Linear SVM | 0.0042 | Fair |
| RBF SVM | 0.0044 | Fair |
# Rank the predictors with recursive feature elimination (RFE), resampled
# with 5-fold cross-validation.
set.seed(123)
rfe_ctrl <- rfeControl(functions = rfFuncs, method = "cv", number = 5)
# NOTE(review): rfFuncs is caret's random-forest helper set, so this ranking
# comes from a random forest, not from either SVM fitted above - confirm this
# is intended before describing it as SVM feature importance.
# All subset sizes from 1 to 9 predictors are evaluated.
feature_selection <- rfe(train_data[, 1:9], train_data$Class,
sizes = c(1:9),
rfeControl = rfe_ctrl)
# Plot the resampled RFE results returned above
p_feature_imp <- ggplot(feature_selection) +
labs(title = "Feature Importance Ranking",
subtitle = "Recursive Feature Elimination Results") +
theme_publication()
print(p_feature_imp)Gambar 10: Analisis Pentingnya Fitur dalam Model SVM
# Top features table
top_features <- data.frame(
Rank = 1:length(feature_selection$optVariables),
Feature = feature_selection$optVariables,
Description = c(
"Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
"Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei",
"Bland Chromatin", "Normal Nucleoli", "Mitoses"
)[match(feature_selection$optVariables, names(train_data)[1:9])]
)
kable(top_features,
caption = "**Tabel 7:** Ranking Pentingnya Fitur",
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Rank | Feature | Description |
|---|---|---|
| 1 | Bare.nuclei | Bare Nuclei |
| 2 | Cl.thickness | Clump Thickness |
| 3 | Cell.size | Uniformity of Cell Size |
| 4 | Cell.shape | Uniformity of Cell Shape |
| 5 | Bl.cromatin | Bland Chromatin |
| 6 | Normal.nucleoli | Normal Nucleoli |
| 7 | Epith.c.size | Single Epithelial Cell Size |
| 8 | Marg.adhesion | Marginal Adhesion |
#' Compute a learning curve for one model family
#'
#' For every requested training size, draws a random subset of `train_data`
#' (without replacement), fits a model on it and records the accuracy on that
#' subset and on `test_data`.
#'
#' @param model_func Function taking a data frame and returning a fitted model
#'   that `predict()` can be called on.
#' @param train_data Data frame with a `Class` column; subsets are sampled
#'   from its rows.
#' @param test_data Data frame with a `Class` column used for the held-out
#'   accuracy.
#' @param sizes Vector of subset sizes; each must not exceed
#'   `nrow(train_data)`, otherwise `sample()` raises an error.
#' @return A data frame with columns `Size`, `Train_Accuracy` and
#'   `Test_Accuracy`, one row per element of `sizes` (an empty data frame when
#'   `sizes` is empty).
calculate_learning_curve <- function(model_func, train_data, test_data, sizes) {
  if (length(sizes) == 0) {
    return(data.frame())
  }

  # Build one row per size and bind once at the end instead of growing a data
  # frame inside a loop. Sizes are processed in order, so the random stream is
  # consumed exactly as in a sequential loop.
  curve_rows <- lapply(sizes, function(size) {
    sample_idx <- sample(nrow(train_data), size)
    train_sample <- train_data[sample_idx, ]

    model <- model_func(train_sample)

    train_pred <- predict(model, train_sample)
    test_pred <- predict(model, test_data)

    data.frame(
      Size = size,
      Train_Accuracy = mean(train_pred == train_sample$Class),
      Test_Accuracy = mean(test_pred == test_data$Class)
    )
  })

  do.call(rbind, curve_rows)
}
# Define model functions
linear_func <- function(data) svm(Class ~ ., data = data, kernel = "linear", cost = 1)
rbf_func <- function(data) svm(Class ~ ., data = data, kernel = "radial", cost = 1, gamma = 0.1)
# Calculate learning curves
sizes <- seq(50, nrow(train_data), by = 50)
lc_linear <- calculate_learning_curve(linear_func, train_data, test_data, sizes)
lc_rbf <- calculate_learning_curve(rbf_func, train_data, test_data, sizes)
# Combine data
lc_linear$Model <- "Linear SVM"
lc_rbf$Model <- "RBF SVM"
lc_combined <- rbind(lc_linear, lc_rbf)
# Reshape data for plotting
lc_melted <- lc_combined %>%
gather(key = "Dataset", value = "Accuracy", Train_Accuracy, Test_Accuracy) %>%
mutate(Dataset = ifelse(Dataset == "Train_Accuracy", "Training", "Validation"))
# Plot learning curves
p_learning_curves <- ggplot(lc_melted, aes(x = Size, y = Accuracy, color = Dataset, linetype = Model)) +
geom_line(size = 1.2, alpha = 0.8) +
geom_point(size = 2, alpha = 0.7) +
scale_color_manual(values = c("Training" = "#2ecc71", "Validation" = "#e74c3c")) +
scale_linetype_manual(values = c("Linear SVM" = "solid", "RBF SVM" = "dashed")) +
labs(title = "Learning Curves Analysis",
subtitle = "Training vs Validation Accuracy with Increasing Sample Size",
x = "Training Sample Size", y = "Accuracy") +
theme_publication() +
theme(legend.position = "bottom")
print(p_learning_curves)Gambar 11: Learning Curves untuk Analisis Bias-Variance
# Training time comparison.
# The fits are timed with the tuned hyperparameters so the timing column
# describes the same models as the support-vector and memory columns
# (final_linear / final_rbf). `check.names = FALSE` keeps headers such as
# "Training Time (seconds)" from being rewritten to
# "Training.Time..seconds.".
training_times <- data.frame(
  Model = c("Linear SVM", "RBF SVM"),
  `Training Time (seconds)` = c(
    system.time(
      svm(Class ~ ., data = train_data,
          kernel = "linear",
          cost = svm_linear_cv$bestTune$C)
    )[["elapsed"]],
    system.time(
      svm(Class ~ ., data = train_data,
          kernel = "radial",
          cost = svm_rbf_cv$bestTune$C,
          gamma = svm_rbf_cv$bestTune$sigma)
    )[["elapsed"]]
  ),
  `Support Vectors` = c(
    final_linear$tot.nSV,
    final_rbf$tot.nSV
  ),
  # object.size() reports bytes; convert to megabytes
  `Memory Usage (MB)` = c(
    as.numeric(object.size(final_linear)) / 1024^2,
    as.numeric(object.size(final_rbf)) / 1024^2
  ),
  check.names = FALSE
)
# Round numeric values
training_times[, 2:4] <- round(training_times[, 2:4], 3)
kable(training_times,
caption = "**Tabel 8:** Analisis Kompleksitas Komputasi",
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Model | Training.Time..seconds. | Support.Vectors | Memory.Usage..MB. |
|---|---|---|---|
| Linear SVM | 0.03 | 50 | 0.092 |
| RBF SVM | 0.06 | 72 | 0.096 |
# Create recommendation framework: which model to prefer per usage scenario,
# with the tuned hyperparameters taken from the cross-validated grid search.
# `check.names = FALSE` keeps the headers "Model Pilihan" and
# "Parameter Optimal" from being rewritten to "Model.Pilihan" and
# "Parameter.Optimal".
linear_params <- paste("C =", svm_linear_cv$bestTune$C)
rbf_params <- paste(
  "C =", svm_rbf_cv$bestTune$C,
  ", γ =", round(svm_rbf_cv$bestTune$sigma, 4)
)

recommendations <- data.frame(
  Skenario = c(
    "Deployment Produksi",
    "Interpretabilitas Tinggi",
    "Data Kompleks/Non-linear",
    "Resource Terbatas",
    "Akurasi Maksimal"
  ),
  `Model Pilihan` = c(
    "Linear SVM",
    "Linear SVM",
    "RBF SVM",
    "Linear SVM",
    "RBF SVM"
  ),
  Alasan = c(
    "Lebih cepat, stabil, mudah deploy",
    "Decision boundary linear lebih mudah dijelaskan",
    "Kernel RBF dapat menangani pola non-linear",
    "Kompleksitas komputasi lebih rendah",
    "Performa sedikit lebih baik pada dataset ini"
  ),
  # Same order as `Model Pilihan` above
  `Parameter Optimal` = c(
    linear_params,
    linear_params,
    rbf_params,
    linear_params,
    rbf_params
  ),
  check.names = FALSE
)
kable(recommendations,
caption = "**Tabel 9:** Pemilihan Model berdasarkan Skenario",
align = "l") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = TRUE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50", width = "20%") %>%
column_spec(2, bold = TRUE, color = "#e74c3c", width = "15%") %>%
column_spec(3, width = "40%") %>%
column_spec(4, width = "25%") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Skenario | Model.Pilihan | Alasan | Parameter.Optimal |
|---|---|---|---|
| Deployment Produksi | Linear SVM | Lebih cepat, stabil, mudah deploy | C = 0.1 |
| Interpretabilitas Tinggi | Linear SVM | Decision boundary linear lebih mudah dijelaskan | C = 0.1 |
| Data Kompleks/Non-linear | RBF SVM | Kernel RBF dapat menangani pola non-linear | C = 10 , γ = 0.001 |
| Resource Terbatas | Linear SVM | Kompleksitas komputasi lebih rendah | C = 0.1 |
| Akurasi Maksimal | RBF SVM | Performa sedikit lebih baik pada dataset ini | C = 10 , γ = 0.001 |
Penelitian ini mendemonstrasikan bahwa Support Vector Machine adalah algoritma yang sangat efektif untuk klasifikasi Breast Cancer. Baik Linear SVM maupun RBF SVM mencapai performa yang excellent dengan akurasi >95%.
Untuk aplikasi praktis, Linear SVM direkomendasikan untuk deployment karena kesederhanaan dan kecepatan, sementara RBF SVM dapat dipilih jika akurasi maksimal diprioritaskan. Framework evaluasi yang dikembangkan dapat menjadi template untuk penelitian serupa di domain medis.
Support Vector Regression (SVR) adalah salah satu metode dalam pembelajaran mesin yang berguna untuk melakukan prediksi nilai kontinu. Berbeda dengan SVM klasifikasi yang mencari hyperplane untuk memisahkan kelas, SVR mencari fungsi regresi yang dapat memprediksi nilai kontinu dengan toleransi error dalam konsep epsilon-tube.
# Synthetic 1-D data for the SVR concept figure: a sine wave on top of a
# linear trend, sampled on an even grid with Gaussian noise added.
set.seed(42)
n <- 50
x <- seq(from = 0, to = 10, length.out = n)
y_true <- sin(x) + 0.5 * x + rnorm(n, mean = 0, sd = 0.3)

# Data frame used to fit the stand-in regression curve.
svr_demo <- data.frame(x = x, y = y_true)

# A cubic polynomial fitted by least squares plays the role of the regression
# function in this illustration; no SVR model is trained here.
lm_fit <- lm(y ~ poly(x, 3), data = svr_demo)
y_pred <- predict(lm_fit, newdata = svr_demo)

# Half-width of the tube drawn around the fitted curve.
epsilon <- 0.5
svr_viz <- data.frame(
  x = x,
  y_actual = y_true,
  y_pred = y_pred,
  upper_tube = y_pred + epsilon,
  lower_tube = y_pred - epsilon
)

# Both tube edges share the same look, so one helper builds either of them.
tube_edge <- function(mapping) {
  geom_line(mapping, color = "#3498db", linetype = "dashed", size = 1)
}

p_concept <- ggplot(svr_viz, aes(x = x)) +
  # Shaded band between the two tube edges
  geom_ribbon(
    aes(ymin = lower_tube, ymax = upper_tube),
    alpha = 0.3, fill = "lightblue"
  ) +
  # Fitted curve
  geom_line(aes(y = y_pred), color = "#2c3e50", size = 1.5) +
  # Upper and lower tube edges
  tube_edge(aes(y = upper_tube)) +
  tube_edge(aes(y = lower_tube)) +
  # Observations, coloured by whether their residual exceeds epsilon
  geom_point(
    aes(y = y_actual, color = abs(y_actual - y_pred) > epsilon),
    size = 3, alpha = 0.8
  ) +
  scale_color_manual(
    values = c("TRUE" = "#e74c3c", "FALSE" = "#2ecc71"),
    labels = c("Inside ε-tube", "Support Vectors"),
    name = "Point Type"
  ) +
  labs(
    title = "Konsep Epsilon-Tube dalam SVR",
    subtitle = "Support vectors berada di luar epsilon-tube",
    x = "Feature (x)",
    y = "Target (y)"
  ) +
  annotate(
    "text", x = 7, y = 4, label = "ε-tube\n(toleransi error)",
    color = "#3498db", fontface = "bold", size = 4
  ) +
  annotate(
    "text", x = 2, y = 1, label = "Regression\nFunction",
    color = "#2c3e50", fontface = "bold", size = 4
  ) +
  theme_publication()
print(p_concept)Gambar 1: Ilustrasi Konsep Epsilon-Tube dalam SVR
Epsilon (ε): Toleransi error yang menentukan lebar tube di sekitar fungsi regresi
Support Vectors: Data points yang berada di luar epsilon-tube atau tepat pada batasnya
Fungsi Regresi: Fungsi yang meminimalkan error di luar epsilon-tube
SVR menyelesaikan masalah optimisasi berikut:
Minimize: ½||w||² + C Σ(ξᵢ + ξᵢ*)
Subject to:
yᵢ - wᵀφ(xᵢ) - b ≤ ε + ξᵢ
wᵀφ(xᵢ) + b - yᵢ ≤ ε + ξᵢ*
ξᵢ, ξᵢ* ≥ 0
# Visualize parameter effects: a loess trend through noisy demo data, with
# one pair of dashed curves (trend + eps, trend - eps) per epsilon value.
set.seed(123)
x_demo <- seq(0, 10, length.out = 30)
y_demo <- sin(x_demo) + 0.3 * x_demo + rnorm(30, 0, 0.4)
demo_data <- data.frame(x = x_demo, y = y_demo)

# Epsilon values to illustrate and the colour used for each one.
# Every entry is drawn: the tubes are generated from these vectors instead of
# being written out by hand, so adding a value here adds a tube to the plot.
epsilon_values <- c(0.1, 0.5, 1.0)
colors_eps <- c("#e74c3c", "#3498db", "#2ecc71")
stopifnot(length(epsilon_values) == length(colors_eps))

p_epsilon <- ggplot(demo_data, aes(x = x, y = y)) +
  geom_point(size = 3, alpha = 0.7, color = "#2c3e50") +
  geom_smooth(method = "loess", se = FALSE, color = "#2c3e50", size = 1)

# Add epsilon tubes. Each edge is a loess fit to a copy of the data shifted
# by +eps or -eps; the shifted data frame is built eagerly so every layer
# keeps its own offset.
for (eps_idx in seq_along(epsilon_values)) {
  for (tube_offset in c(1, -1) * epsilon_values[eps_idx]) {
    p_epsilon <- p_epsilon +
      geom_smooth(
        data = transform(demo_data, y = y + tube_offset),
        method = "loess", se = FALSE,
        color = colors_eps[eps_idx], linetype = "dashed", alpha = 0.7
      )
  }
}

p_epsilon <- p_epsilon +
  labs(title = "Pengaruh Parameter Epsilon (ε)",
       subtitle = "Epsilon menentukan lebar tolerance tube",
       x = "x", y = "y") +
  theme_publication()
print(p_epsilon)Gambar 2: Perbandingan Parameter SVR
# Reference table of the main SVR hyper-parameters: what each one does, its
# typical range, and the effect of setting it high or low.
# check.names = FALSE keeps the multi-word column names exactly as written;
# without it data.frame() rewrites them to syntactic names such as
# "Nilai.Tipikal", which then appear as the table headers.
parameter_info <- data.frame(
  Parameter = c("C (Cost)", "ε (Epsilon)", "γ (Gamma)", "Kernel"),
  Fungsi = c(
    "Mengontrol trade-off antara smoothness dan fitting error",
    "Menentukan lebar epsilon-tube (toleransi error)",
    "Parameter kernel RBF yang mengontrol pengaruh setiap training example",
    "Fungsi transformasi data ke ruang dimensi tinggi"
  ),
  `Nilai Tipikal` = c("0.1 - 100", "0.01 - 1.0", "0.001 - 1", "linear, rbf, poly"),
  `Efek Tinggi` = c(
    "Overfitting (kompleks)",
    "Lebih permisif terhadap error",
    "Overfitting (terlalu fleksibel)",
    "Bergantung pada jenis kernel"
  ),
  `Efek Rendah` = c(
    "Underfitting (terlalu smooth)",
    "Ketat terhadap error",
    "Underfitting (terlalu kaku)",
    "Bergantung pada jenis kernel"
  ),
  check.names = FALSE
)
kable(parameter_info,
caption = "**Tabel 1:** Parameter Penting dalam SVR",
align = "l") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE,
position = "center") %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Parameter | Fungsi | Nilai.Tipikal | Efek.Tinggi | Efek.Rendah |
|---|---|---|---|---|
| C (Cost) | Mengontrol trade-off antara smoothness dan fitting error | 0.1 - 100 | Overfitting (kompleks) | Underfitting (terlalu smooth) |
| ε (Epsilon) | Menentukan lebar epsilon-tube (toleransi error) | 0.01 - 1.0 | Lebih permisif terhadap error | Ketat terhadap error |
| γ (Gamma) | Parameter kernel RBF yang mengontrol pengaruh setiap training example | 0.001 - 1 | Overfitting (terlalu fleksibel) | Underfitting (terlalu kaku) |
| Kernel | Fungsi transformasi data ke ruang dimensi tinggi | linear, rbf, poly | Bergantung pada jenis kernel | Bergantung pada jenis kernel |
## 📊 Dataset Air Quality Information:
## - Jumlah observasi: 153
## - Jumlah variabel: 6
## - Periode data: Mei - September 1973
# Display first few rows
kable(head(airquality, 10),
caption = "**Tabel 2:** Sample Data Air Quality",
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Ozone | Solar.R | Wind | Temp | Month | Day |
|---|---|---|---|---|---|
| 41 | 190 | 7.4 | 67 | 5 | 1 |
| 36 | 118 | 8.0 | 72 | 5 | 2 |
| 12 | 149 | 12.6 | 74 | 5 | 3 |
| 18 | 313 | 11.5 | 62 | 5 | 4 |
| NA | NA | 14.3 | 56 | 5 | 5 |
| 28 | NA | 14.9 | 66 | 5 | 6 |
| 23 | 299 | 8.6 | 65 | 5 | 7 |
| 19 | 99 | 13.8 | 59 | 5 | 8 |
| 8 | 19 | 20.1 | 61 | 5 | 9 |
| NA | 194 | 8.6 | 69 | 5 | 10 |
library(tidyr) # ✅ Wajib agar pivot_* tersedia
# Missing-value overview: count the NAs in every column, reshape to one row
# per variable, and express each count as a percentage of all rows.
na_counts <- summarise(airquality, across(everything(), ~ sum(is.na(.x))))
missing_summary <- na_counts %>%
  pivot_longer(
    cols = everything(),
    names_to = "Variable",
    values_to = "Missing_Count"
  ) %>%
  mutate(
    Missing_Percentage = round((Missing_Count / nrow(airquality)) * 100, 1)
  )

# Horizontal bar chart of the counts, bars ordered by count and labelled
# "count (percent%)".
p_missing <- ggplot(
  missing_summary,
  aes(x = reorder(Variable, Missing_Count), y = Missing_Count)
) +
  geom_col(fill = "#e74c3c", alpha = 0.7, width = 0.6) +
  geom_text(
    aes(label = paste0(Missing_Count, " (", Missing_Percentage, "%)")),
    hjust = -0.1, size = 3.5, fontface = "bold"
  ) +
  coord_flip() +
  labs(
    title = "Missing Values Analysis",
    subtitle = "Identifikasi data yang hilang",
    x = "Variables", y = "Missing Count"
  ) +
  theme_publication()

# Long format of every column for the histograms; rows whose value is NA are
# dropped during the reshape.
air_melted <- pivot_longer(
  airquality,
  cols = everything(),
  names_to = "Variable",
  values_to = "Value",
  values_drop_na = TRUE
)

# One histogram per variable, each panel with its own axes.
p_distributions <- ggplot(air_melted, aes(x = Value, fill = Variable)) +
  geom_histogram(bins = 20, alpha = 0.7, color = "white") +
  facet_wrap(~Variable, scales = "free", ncol = 3) +
  scale_fill_viridis_d(name = "Variable") +
  labs(
    title = "Distribusi Variabel",
    subtitle = "Histogram untuk setiap variabel",
    x = "Value", y = "Frequency"
  ) +
  theme_publication() +
  theme(legend.position = "none")
grid.arrange(p_missing, p_distributions, nrow = 2)Gambar 3: Eksplorasi Data Air Quality
# Remove missing values and create correlation matrix.
# na.omit() drops every row that has an NA in any column, not only in Ozone;
# the report's own output shows 153 raw observations and 111 after cleaning.
air_clean <- na.omit(airquality)
# Correlations are computed across all columns of air_clean, so the calendar
# columns Month and Day are part of the matrix as well.
cor_matrix <- cor(air_clean)
# Enhanced correlation plot
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
tl.col = "black",
tl.srt = 45,
tl.cex = 1,
cl.cex = 1,
addCoef.col = "black",
number.cex = 0.8,
title = "Correlation Matrix - Air Quality Variables",
mar = c(0,0,2,0))Gambar 4: Analisis Korelasi Variabel
# Descriptive statistics (mean, SD, min, max) for every column of air_clean,
# returned with one row per variable and one column per statistic.
stat_fns <- list(
  Mean = ~ round(mean(.x), 2),
  SD = ~ round(sd(.x), 2),
  Min = min,
  Max = max
)

# across() names its outputs "<column>_<statistic>" ...
wide_stats <- summarise(air_clean, across(everything(), stat_fns))

# ... so splitting on "_" recovers the two parts. This relies on no column
# name containing an underscore, which holds for the airquality columns.
summary_stats <- wide_stats %>%
  pivot_longer(
    cols = everything(),
    names_to = c("Variable", "Metric"),
    names_sep = "_"
  ) %>%
  pivot_wider(names_from = Metric, values_from = value)
kable(summary_stats,
caption = "**Tabel 3:** Statistik Deskriptif Data Air Quality",
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Variable | Mean | SD | Min | Max |
|---|---|---|---|---|
| Ozone | 42.10 | 33.28 | 1.0 | 168.0 |
| Solar.R | 184.80 | 91.15 | 7.0 | 334.0 |
| Wind | 9.94 | 3.56 | 2.3 | 20.7 |
| Temp | 77.79 | 9.53 | 57.0 | 97.0 |
| Month | 7.22 | 1.47 | 5.0 | 9.0 |
| Day | 15.95 | 8.71 | 1.0 | 31.0 |
# Modelling table: Ozone is the response and is kept as the first column,
# followed by the five predictors. Rows with a missing Ozone value are
# excluded before the columns are picked.
air_model <- air_clean %>%
  filter(!is.na(Ozone)) %>%
  select(Ozone, Solar.R, Wind, Temp, Month, Day)
cat("🧹 Data Preparation Summary:\n")## 🧹 Data Preparation Summary:
## - Target variable: Ozone
## - Predictor variables: Solar.R, Wind, Temp, Month, Day
## - Final dataset size: 111 observations
# Train-test split (70:30). The split is drawn BEFORE scaling so that the
# scaling statistics can be estimated from the training rows alone.
# floor() makes the training size an explicit integer instead of relying on
# sample() truncating a fractional size.
set.seed(123)
n_obs <- nrow(air_model)
train_idx <- sample(n_obs, floor(0.7 * n_obs))

# Feature scaling: centre and scale every predictor (all columns except the
# target) using the mean and SD of the TRAINING rows only, then apply that
# same transformation to all rows. Estimating the statistics on the full
# data would let information from the test rows leak into preprocessing.
# NOTE: coefficients printed for models fitted on the scaled predictors are
# expressed in these training-based units.
predictor_cols <- setdiff(names(air_model), "Ozone")
train_scaling <- scale(air_model[train_idx, predictor_cols])

air_scaled <- air_model
air_scaled[, predictor_cols] <- scale(
  air_model[, predictor_cols],
  center = attr(train_scaling, "scaled:center"),
  scale = attr(train_scaling, "scaled:scale")
)

train_data <- air_scaled[train_idx, ]
test_data <- air_scaled[-train_idx, ]
cat("- Training samples:", nrow(train_data), "\n")## - Training samples: 77
## - Testing samples: 34
# Visualize target variable distribution: histogram of Ozone with a dashed
# vertical line and a text label at the sample mean.
# The mean is computed once and passed to geom_vline() as a constant. Mapping
# it inside aes() evaluates it per data row, which draws one identical line
# for every observation on top of each other.
ozone_mean <- mean(air_model$Ozone)

p_target <- ggplot(air_model, aes(x = Ozone)) +
  geom_histogram(bins = 20, fill = "#3498db", alpha = 0.7, color = "white") +
  geom_vline(xintercept = ozone_mean, color = "#e74c3c",
             linetype = "dashed", size = 1) +
  # Label sits 20 units to the right of the line so it does not overlap it
  annotate("text", x = ozone_mean + 20, y = 15,
           label = paste("Mean =", round(ozone_mean, 1)),
           color = "#e74c3c", fontface = "bold") +
  labs(title = "Distribusi Target Variable (Ozone)",
       subtitle = "Tingkat ozon dalam parts per billion",
       x = "Ozone (ppb)", y = "Frequency") +
  theme_publication()
print(p_target)## 🚀 Training Linear SVR...
# Train the linear-kernel SVR (eps-regression) on the training split, with
# Ozone as the response and every other column as a predictor.
# cost and epsilon are written out explicitly and are not tuned here.
svr_linear <- svm(Ozone ~ ., data = train_data, kernel = "linear",
cost = 1, epsilon = 0.1, type = "eps-regression")
# Model summary
cat("📊 Linear SVR Model Summary:\n")## 📊 Linear SVR Model Summary:
##
## Call:
## svm(formula = Ozone ~ ., data = train_data, kernel = "linear", cost = 1,
## epsilon = 0.1, type = "eps-regression")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.2
## epsilon: 0.1
##
##
## Number of Support Vectors: 66
# Linear SVR predictions on both partitions
pred_linear_train <- predict(svr_linear, newdata = train_data)
pred_linear_test <- predict(svr_linear, newdata = test_data)

# Prediction errors (actual minus predicted), reused by RMSE and MAE
err_linear_train <- train_data$Ozone - pred_linear_train
err_linear_test <- test_data$Ozone - pred_linear_test

# One row per partition. R_squared here is the squared Pearson correlation
# between actual and predicted values.
metrics_linear <- data.frame(
  Dataset = c("Training", "Testing"),
  RMSE = sqrt(c(mean(err_linear_train^2), mean(err_linear_test^2))),
  MAE = c(mean(abs(err_linear_train)), mean(abs(err_linear_test))),
  R_squared = c(
    cor(train_data$Ozone, pred_linear_train),
    cor(test_data$Ozone, pred_linear_test)
  )^2
)
kable(metrics_linear,
caption = "**Tabel 4:** Performa SVR Linear",
digits = 3,
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Dataset | RMSE | MAE | R_squared |
|---|---|---|---|
| Training | 22.137 | 14.812 | 0.626 |
| Testing | 17.118 | 14.451 | 0.665 |
## ✅ Linear SVR training completed!
## 🚀 Training RBF SVR...
# Train the RBF-kernel SVR (eps-regression) on the training split with fixed,
# untuned settings. Note that gamma = 0.1 is chosen by hand rather than left
# at its default: the linear fit, which does not set gamma, reports 0.2 in
# its model summary.
svr_rbf <- svm(Ozone ~ ., data = train_data, kernel = "radial",
cost = 1, gamma = 0.1, epsilon = 0.1, type = "eps-regression")
# Model summary
cat("📊 RBF SVR Model Summary:\n")## 📊 RBF SVR Model Summary:
##
## Call:
## svm(formula = Ozone ~ ., data = train_data, kernel = "radial", cost = 1,
## gamma = 0.1, epsilon = 0.1, type = "eps-regression")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.1
## epsilon: 0.1
##
##
## Number of Support Vectors: 60
# RBF SVR predictions on both partitions
pred_rbf_train <- predict(svr_rbf, newdata = train_data)
pred_rbf_test <- predict(svr_rbf, newdata = test_data)

# Prediction errors (actual minus predicted), reused by RMSE and MAE
err_rbf_train <- train_data$Ozone - pred_rbf_train
err_rbf_test <- test_data$Ozone - pred_rbf_test

# One row per partition. R_squared here is the squared Pearson correlation
# between actual and predicted values.
metrics_rbf <- data.frame(
  Dataset = c("Training", "Testing"),
  RMSE = sqrt(c(mean(err_rbf_train^2), mean(err_rbf_test^2))),
  MAE = c(mean(abs(err_rbf_train)), mean(abs(err_rbf_test))),
  R_squared = c(
    cor(train_data$Ozone, pred_rbf_train),
    cor(test_data$Ozone, pred_rbf_test)
  )^2
)
kable(metrics_rbf,
caption = "**Tabel 5:** Performa SVR RBF",
digits = 3,
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Dataset | RMSE | MAE | R_squared |
|---|---|---|---|
| Training | 17.916 | 10.794 | 0.787 |
| Testing | 15.513 | 12.372 | 0.723 |
## ✅ RBF SVR training completed!
## 🚀 Training OLS Linear Regression...
# Train the OLS baseline: ordinary least squares regression of Ozone on all
# remaining columns of the training split (same formula and data as the SVR
# fits, so the three models are compared on identical inputs).
ols_model <- lm(Ozone ~ ., data = train_data)
# Model summary
cat("📊 OLS Model Summary:\n")## 📊 OLS Model Summary:
##
## Call:
## lm(formula = Ozone ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.680 -13.668 -4.041 7.706 92.538
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.236 2.556 17.696 < 2e-16 ***
## Solar.R 3.255 2.663 1.222 0.2257
## Wind -12.215 2.851 -4.285 5.65e-05 ***
## Temp 19.397 3.168 6.124 4.54e-08 ***
## Month -5.011 2.735 -1.833 0.0711 .
## Day 1.934 2.647 0.731 0.4673
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.1 on 71 degrees of freedom
## Multiple R-squared: 0.6335, Adjusted R-squared: 0.6077
## F-statistic: 24.54 on 5 and 71 DF, p-value: 2.897e-14
# OLS predictions on both partitions
pred_ols_train <- predict(ols_model, newdata = train_data)
pred_ols_test <- predict(ols_model, newdata = test_data)

# Prediction errors (actual minus predicted), reused by RMSE and MAE
err_ols_train <- train_data$Ozone - pred_ols_train
err_ols_test <- test_data$Ozone - pred_ols_test

# One row per partition. Training R_squared is read from the model summary;
# testing R_squared is the squared Pearson correlation between actual and
# predicted values.
metrics_ols <- data.frame(
  Dataset = c("Training", "Testing"),
  RMSE = sqrt(c(mean(err_ols_train^2), mean(err_ols_test^2))),
  MAE = c(mean(abs(err_ols_train)), mean(abs(err_ols_test))),
  R_squared = c(
    summary(ols_model)$r.squared,
    cor(test_data$Ozone, pred_ols_test)^2
  )
)
kable(metrics_ols,
caption = "**Tabel 6:** Performa OLS Regression",
digits = 3,
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
column_spec(1, bold = TRUE, color = "#2c3e50") %>%
row_spec(0, bold = TRUE, color = "white", background = "#3498db")| Dataset | RMSE | MAE | R_squared |
|---|---|---|---|
| Training | 21.226 | 15.543 | 0.633 |
| Testing | 19.096 | 15.782 | 0.672 |
## ✅ OLS regression training completed!
# Stack the three per-model metric tables, tagging each with its model name
all_metrics <- rbind(
  cbind(Model = "SVR Linear", metrics_linear),
  cbind(Model = "SVR RBF", metrics_rbf),
  cbind(Model = "OLS", metrics_ols)
)

# Only the held-out rows are compared in the charts
test_metrics <- filter(all_metrics, Dataset == "Testing")

# One bar chart per metric, value printed above each bar, legend suppressed
# because the x axis already names the model.
p_rmse <- ggplot(test_metrics, aes(x = Model, y = RMSE, fill = Model)) +
  geom_col(alpha = 0.8, width = 0.6) +
  geom_text(aes(label = round(RMSE, 3)), vjust = -0.5, fontface = "bold") +
  labs(title = "RMSE Comparison") +
  theme_publication() +
  theme(legend.position = "none")

p_mae <- ggplot(test_metrics, aes(x = Model, y = MAE, fill = Model)) +
  geom_col(alpha = 0.8, width = 0.6) +
  geom_text(aes(label = round(MAE, 3)), vjust = -0.5, fontface = "bold") +
  labs(title = "MAE Comparison") +
  theme_publication() +
  theme(legend.position = "none")

p_r2 <- ggplot(test_metrics, aes(x = Model, y = R_squared, fill = Model)) +
  geom_col(alpha = 0.8, width = 0.6) +
  geom_text(
    aes(label = round(R_squared, 3)),
    vjust = -0.5, fontface = "bold"
  ) +
  labs(title = "R² Comparison") +
  theme_publication() +
  theme(legend.position = "none")
grid.arrange(p_rmse, p_mae, p_r2, ncol=3,
top=grid::textGrob("Model Performance Comparison (Test Set)", gp=grid::gpar(fontsize=16, fontface="bold")))Gambar 5: Perbandingan Performa Model
# Long table of test-set predictions: the actual Ozone values are repeated
# once per model and paired with that model's predictions, in the order
# SVR Linear, SVR RBF, OLS.
model_labels <- c("SVR Linear", "SVR RBF", "OLS")
n_test <- nrow(test_data)

pred_comparison <- data.frame(
  Actual = rep(test_data$Ozone, times = 3),
  Predicted = c(pred_linear_test, pred_rbf_test, pred_ols_test),
  Model = rep(model_labels, each = n_test)
)
ggplot(pred_comparison, aes(x = Actual, y = Predicted, color = Model)) +
geom_point(alpha = 0.7) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray40") +
labs(title = "Prediksi vs Nilai Aktual",
x = "Nilai Aktual",
y = "Nilai Prediksi") +
theme_publication()Gambar 6: Scatter Plot Prediksi vs Nilai Aktual
Penelitian ini menunjukkan bahwa Support Vector Regression (SVR) adalah metode yang sangat efektif untuk memodelkan dan memprediksi kualitas udara menggunakan dataset airquality. Baik Linear SVR maupun RBF SVR mampu menghasilkan prediksi yang akurat dengan error rendah, di mana SVR RBF secara konsisten memberikan performa terbaik dalam hal Root Mean Squared Error (RMSE) dan koefisien determinasi (R²).
Untuk aplikasi yang membutuhkan kecepatan dan kesederhanaan, Linear SVR dapat menjadi pilihan utama. Namun, jika akurasi maksimal menjadi prioritas, RBF SVR dengan kernel non-linear lebih direkomendasikan. Kerangka evaluasi dan metodologi yang dikembangkan dapat dijadikan acuan untuk penelitian prediksi di bidang lingkungan dan kesehatan.
🔗 Repository: github.com/syahlaanisah13/svm&svr-analysis
📅 Dipresentasikan pada: 06-03-2025