rm(list = ls())
set.seed(1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
column_names <- c("SampleCodeNumber", "ClumpThickness", "UniformityOfCellSize", "UniformityOfCellShape", "MarginalAdhesion", "SingleEpithelialCellSize", "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses", "Class")
cancer_df <- read_csv('./Datasets/Classification/breast-cancer-wisconsin.data',
col_names = column_names, show_col_types = FALSE) %>%
# Convert all columns to numeric
mutate(across(everything(), as.numeric)) %>%
# Drop rows with any NA values
drop_na() %>%
mutate(Class = if_else(Class == 2, 0, 1),  # recode: 2 (benign) -> 0, 4 (malignant) -> 1
Class = as.factor(Class))
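As a quick sanity check (an addition here, not part of the original pipeline), we can confirm how many rows survived the cleaning step and how the two classes are balanced:
# Rows remaining after drop_na(), and the benign (0) / malignant (1) split
dim(cancer_df)
count(cancer_df, Class)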
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Split the data into training and testing sets
num_samples <- nrow(cancer_df)
set.seed(123)
train_indices <- sample(num_samples, size = floor(0.8 * num_samples))
train <- cancer_df[train_indices, ]
test <- cancer_df[-train_indices, ]
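Since caret is already loaded, note that createDataPartition() offers a stratified alternative that preserves the class proportions in both sets. The sketch below is optional and is not used in the analysis that follows:
# Optional stratified 80/20 split (sketch; the simple random split above is kept)
strat_idx <- createDataPartition(cancer_df$Class, p = 0.8, list = FALSE)[, 1]
train_strat <- cancer_df[strat_idx, ]
test_strat <- cancer_df[-strat_idx, ]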
# a
svmfit_baseline <- svm(Class ~ ., data = train, kernel = "linear", cost = 10, scale = FALSE)
summary(svmfit_baseline)
##
## Call:
## svm(formula = Class ~ ., data = train, kernel = "linear", cost = 10,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
##
## Number of Support Vectors: 185
##
## ( 89 96 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# plot(svmfit_baseline,train)
# b
set.seed(123) # Ensure reproducibility
tune_result <- tune(svm, Class ~ ., data = train, kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(tune_result)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.01
##
## - best performance: 0.02747475
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.04754209 0.03854555
## 2 1e-02 0.02747475 0.02334574
## 3 1e-01 0.02932660 0.02776840
## 4 1e+00 0.03117845 0.02758441
## 5 5e+00 0.03117845 0.02758441
## 6 1e+01 0.03117845 0.02758441
## 7 1e+02 0.03117845 0.02758441
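With a single tuned hyperparameter, the cross-validation error can also be inspected visually; e1071 provides a plot method for tune objects:
# CV error as a function of cost
plot(tune_result)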
# Best model
best_model <- tune_result$best.model
summary(best_model)
##
## Call:
## best.tune(METHOD = svm, train.x = Class ~ ., data = train, ranges = list(cost = c(0.001,
## 0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.01
##
## Number of Support Vectors: 92
##
## ( 45 47 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# plot(best_model,train)
# c
predictions_baseline <- predict(svmfit_baseline, test)
predictions_tuned <- predict(best_model, test)
# Comparing the accuracy of both models
table(predict = predictions_baseline, truth = test$Class)
## truth
## predict 0 1
## 0 90 47
## 1 0 0
table(predict = predictions_tuned, truth = test$Class)
## truth
## predict 0 1
## 0 87 2
## 1 3 45
# Confusion matrix
confusion_matrix_baseline <- confusionMatrix(predictions_baseline, test$Class)
confusion_matrix_tuned <- confusionMatrix(predictions_tuned, test$Class)
accuracy_baseline <- confusion_matrix_baseline$overall['Accuracy']
accuracy_tuned <- confusion_matrix_tuned$overall['Accuracy']
cat("Baseline Model Accuracy:", accuracy_baseline, "\n")
## Baseline Model Accuracy: 0.6569343
cat("Tuned Model Accuracy:", accuracy_tuned, "\n")
## Tuned Model Accuracy: 0.9635036
The baseline model's accuracy is 0.6569343, which is poor: it classifies every test case as class 0. This is likely an artifact of fitting with scale = FALSE: the formula Class ~ . includes the identifier column SampleCodeNumber, whose large magnitude can dominate the unscaled fit, whereas tune() leaves svm's default scale = TRUE.
Cross-validation selects cost = 0.01 as the best parameter.
The tuned model in (b) reaches a test accuracy of 0.9635036, a substantial improvement over the 0.6569343 of the model in (a).
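Accuracy alone also understates how the baseline fails: it never predicts the malignant class at all. The per-class metrics that caret stores make this explicit (a small addition; positive = "1" designates the malignant class, since caret otherwise treats the first factor level, 0, as positive):
# Sensitivity/specificity with class 1 (malignant) as the positive class
cm_base <- confusionMatrix(predictions_baseline, test$Class, positive = "1")
cm_tuned <- confusionMatrix(predictions_tuned, test$Class, positive = "1")
cm_base$byClass[c("Sensitivity", "Specificity")] # sensitivity is 0 for the baseline
cm_tuned$byClass[c("Sensitivity", "Specificity")]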
# a
svmfit_rbf <- svm(Class ~ ., data = train, kernel = "radial", gamma = 1, cost = 1)
summary(svmfit_rbf)
##
## Call:
## svm(formula = Class ~ ., data = train, kernel = "radial", gamma = 1,
## cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 257
##
## ( 66 191 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# plot(svmfit_rbf,train)
predictions_rbf <- predict(svmfit_rbf, test)
confusion_matrix_rbf <- confusionMatrix(predictions_rbf, test$Class)
cat("RBF Model Accuracy:", confusion_matrix_rbf$overall['Accuracy'], "\n")
## RBF Model Accuracy: 0.9489051
# b
set.seed(123)
tune_result_rbf <- tune(svm, Class ~ ., data = train, kernel = "radial",
ranges = list(cost = c(0.1, 1, 10, 100, 1000), gamma = c(0.5, 1, 2, 3, 4)))
summary(tune_result_rbf)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 0.5
##
## - best performance: 0.04393939
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 1e-01 0.5 0.05680135 0.03293552
## 2 1e+00 0.5 0.04400673 0.02331651
## 3 1e+01 0.5 0.04393939 0.02761137
## 4 1e+02 0.5 0.04393939 0.02761137
## 5 1e+03 0.5 0.04393939 0.02761137
## 6 1e-01 1.0 0.06784512 0.04428239
## 7 1e+00 1.0 0.05494949 0.03858409
## 8 1e+01 1.0 0.05313131 0.03917150
## 9 1e+02 1.0 0.05313131 0.03917150
## 10 1e+03 1.0 0.05313131 0.03917150
## 11 1e-01 2.0 0.08983165 0.05257219
## 12 1e+00 2.0 0.06599327 0.04351301
## 13 1e+01 2.0 0.06228956 0.03978703
## 14 1e+02 2.0 0.06228956 0.03978703
## 15 1e+03 2.0 0.06228956 0.03978703
## 16 1e-01 3.0 0.35185185 0.06817293
## 17 1e+00 3.0 0.08427609 0.04687599
## 18 1e+01 3.0 0.07693603 0.04558426
## 19 1e+02 3.0 0.07693603 0.04558426
## 20 1e+03 3.0 0.07693603 0.04558426
## 21 1e-01 4.0 0.35185185 0.06817293
## 22 1e+00 4.0 0.09535354 0.04917536
## 23 1e+01 4.0 0.09164983 0.04847154
## 24 1e+02 4.0 0.09164983 0.04847154
## 25 1e+03 4.0 0.09164983 0.04847154
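With two hyperparameters, the same plot method draws the CV error surface over the cost/gamma grid, which makes the flat region beyond cost = 10 easy to see:
# CV error over the cost x gamma grid
plot(tune_result_rbf)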
best_model_rbf <- tune_result_rbf$best.model
summary(best_model_rbf)
##
## Call:
## best.tune(METHOD = svm, train.x = Class ~ ., data = train, ranges = list(cost = c(0.1,
## 1, 10, 100, 1000), gamma = c(0.5, 1, 2, 3, 4)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
##
## Number of Support Vectors: 207
##
## ( 44 163 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
# plot(best_model_rbf,train)
predictions_best_rbf <- predict(best_model_rbf, test)
confusion_matrix_best_rbf <- confusionMatrix(predictions_best_rbf, test$Class)
cat("Initial RBF Model Accuracy:", confusion_matrix_rbf$overall['Accuracy'], "\n")
## Initial RBF Model Accuracy: 0.9489051
cat("Tuned RBF Model Accuracy:", confusion_matrix_best_rbf$overall['Accuracy'], "\n")
## Tuned RBF Model Accuracy: 0.9562044
b/c. Cross-validation selects cost = 10 and gamma = 0.5, and the tuned RBF model reaches a test accuracy of 0.9562044, only a marginal improvement over the 0.9489051 of the untuned RBF model in (a).
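Note also that the selected gamma (0.5) sits at the lower edge of the search grid, so the true optimum may lie at a smaller value. A follow-up search is sketched below (the candidate values are arbitrary choices, and its results are not shown here):
# Extend the search toward smaller gamma values (sketch only)
set.seed(123)
tune_result_rbf2 <- tune(svm, Class ~ ., data = train, kernel = "radial",
ranges = list(cost = c(1, 10, 100), gamma = c(0.01, 0.05, 0.1, 0.5)))
summary(tune_result_rbf2)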
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.3.3
# Helper: draw an ROC curve (TPR vs. FPR) from scores and true labels
rocplot <- function(pred, truth, main = "ROC Curve", ...) {
  predob <- prediction(pred, truth)
  perf <- performance(predob, "tpr", "fpr")
  plot(perf, main = main, ...)
}
# For the best linear kernel model (cost = 0.01, from the tuning in part b)
predictions_linear <- predict(best_model, test, decision.values = TRUE)
attr_linear <- attributes(predictions_linear)
scores_linear <- attr_linear$decision.values
# For the best RBF kernel model
predictions_rbf <- predict(best_model_rbf, test, decision.values = TRUE)
attr_rbf <- attributes(predictions_rbf)
scores_rbf <- attr_rbf$decision.values
# ROC curves: linear (black) vs. RBF (red) on the same axes
rocplot(scores_linear, test$Class, main = "ROC Curve Comparison")
rocplot(scores_rbf, test$Class, add = TRUE, col = "red")
legend("bottomright", legend=c("Linear Kernel", "RBF Kernel"), col=c("black", "red"), lwd=2)
The closer an ROC curve is to the top-left corner, the better the classifier: that position corresponds to a high true positive rate (sensitivity) together with a low false positive rate. If one model's curve lies above the other across the whole FPR range, it dominates, achieving higher sensitivity at every level of specificity. Here the linear kernel's curve stays above that of the RBF kernel throughout, so the best linear kernel model is the better classifier on this test set.
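The visual comparison can be quantified with the area under each curve, which ROCR computes via the "auc" performance measure (a small addition to the analysis above):
# AUC for each model, using the same decision-value scores as the plots
# (if a curve sits below the diagonal, negate its scores: the sign of
#  decision values depends on the factor level ordering seen by svm)
auc_linear <- performance(prediction(scores_linear, test$Class), "auc")@y.values[[1]]
auc_rbf <- performance(prediction(scores_rbf, test$Class), "auc")@y.values[[1]]
cat("AUC (linear):", auc_linear, " AUC (RBF):", auc_rbf, "\n")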