SVM Analysis of Drug Response Data
# Fetch the drug-response CSV from its hosted location and preview the
# first six rows.
drug_data_url <- "https://raw.githubusercontent.com/Kingtilon1/MachineLearning-BigData/refs/heads/main/DecisionTree/drug200.csv"
drug_data <- read.csv(file = drug_data_url)
head(drug_data, n = 6L)
## Age Sex BP Cholesterol Na_to_K Drug
## 1 23 F HIGH HIGH 25.355 drugY
## 2 47 M LOW HIGH 13.093 drugC
## 3 47 M LOW HIGH 10.114 drugC
## 4 28 F NORMAL HIGH 7.798 drugX
## 5 61 F LOW HIGH 18.043 drugY
## 6 22 F NORMAL HIGH 8.607 drugX
# Show the structure of the loaded data frame: dimensions, column names and
# the storage type of each column.
str(drug_data)
## 'data.frame': 200 obs. of 6 variables:
## $ Age : int 23 47 47 28 61 22 49 41 60 43 ...
## $ Sex : chr "F" "M" "M" "F" ...
## $ BP : chr "HIGH" "LOW" "LOW" "NORMAL" ...
## $ Cholesterol: chr "HIGH" "HIGH" "HIGH" "HIGH" ...
## $ Na_to_K : num 25.4 13.1 10.1 7.8 18 ...
## $ Drug : chr "drugY" "drugC" "drugC" "drugX" ...
# Per-column summary (missing counts, unique values, distribution statistics),
# grouped by column type.
skim(drug_data)
| Name | drug_data |
| Number of rows | 200 |
| Number of columns | 6 |
| _______________________ | |
| Column type frequency: | |
| character | 4 |
| numeric | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Sex | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| BP | 0 | 1 | 3 | 6 | 0 | 3 | 0 |
| Cholesterol | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| Drug | 0 | 1 | 5 | 5 | 0 | 5 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Age | 0 | 1 | 44.31 | 16.54 | 15.00 | 31.00 | 45.00 | 58.00 | 74.00 | ▆▆▇▆▆ |
| Na_to_K | 0 | 1 | 16.08 | 7.22 | 6.27 | 10.45 | 13.94 | 19.38 | 38.25 | ▇▆▂▂▁ |
# Convert every character column (three predictors plus the Drug outcome)
# to a factor so the classifiers treat them as categorical.
categorical_cols <- c("Sex", "BP", "Cholesterol", "Drug")
drug_data[categorical_cols] <- lapply(drug_data[categorical_cols], as.factor)
# Fix the RNG seed so the partition below is reproducible.
set.seed(123)

# Draw 80% of the rows for training, sampling within the Drug outcome;
# `list = FALSE` returns the row indices as a matrix rather than a list.
train_index <- createDataPartition(
  y = drug_data$Drug,
  p = 0.8,
  list = FALSE
)

# Rows picked by the partition form the training set; the rest are held out.
train_data <- drug_data[train_index, ]
test_data <- drug_data[-train_index, ]
# Learn centering/scaling parameters from the numeric predictors of the
# training set only, then apply that same transformation to both splits so
# the test data never influences the scaling.
numeric_cols <- c("Age", "Na_to_K")
preproc <- preProcess(
  train_data[, numeric_cols],
  method = c("center", "scale")
)
train_data_scaled <- predict(preproc, newdata = train_data)
test_data_scaled <- predict(preproc, newdata = test_data)
# Baseline model: linear-kernel SVM on all predictors with cost = 1.
svm_linear <- svm(
  Drug ~ .,
  data = train_data_scaled,
  kernel = "linear",
  cost = 1
)

# Score the held-out rows and compare predictions with the true labels.
pred_linear <- predict(svm_linear, newdata = test_data_scaled)
conf_matrix_linear <- confusionMatrix(
  data = pred_linear,
  reference = test_data_scaled$Drug
)
print(conf_matrix_linear)
## Confusion Matrix and Statistics
##
## Reference
## Prediction drugA drugB drugC drugX drugY
## drugA 4 0 0 0 0
## drugB 0 2 0 0 0
## drugC 0 0 3 0 0
## drugX 0 0 0 9 0
## drugY 0 1 0 1 18
##
## Overall Statistics
##
## Accuracy : 0.9474
## 95% CI : (0.8225, 0.9936)
## No Information Rate : 0.4737
## P-Value [Acc > NIR] : 4.248e-10
##
## Kappa : 0.9211
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: drugA Class: drugB Class: drugC Class: drugX
## Sensitivity 1.0000 0.66667 1.00000 0.9000
## Specificity 1.0000 1.00000 1.00000 1.0000
## Pos Pred Value 1.0000 1.00000 1.00000 1.0000
## Neg Pred Value 1.0000 0.97222 1.00000 0.9655
## Prevalence 0.1053 0.07895 0.07895 0.2632
## Detection Rate 0.1053 0.05263 0.07895 0.2368
## Detection Prevalence 0.1053 0.05263 0.07895 0.2368
## Balanced Accuracy 1.0000 0.83333 1.00000 0.9500
## Class: drugY
## Sensitivity 1.0000
## Specificity 0.9000
## Pos Pred Value 0.9000
## Neg Pred Value 1.0000
## Prevalence 0.4737
## Detection Rate 0.4737
## Detection Prevalence 0.5263
## Balanced Accuracy 0.9500
# Same setup as the linear model but with a radial (RBF) kernel; gamma is
# left at the package default.
svm_radial <- svm(
  Drug ~ .,
  data = train_data_scaled,
  kernel = "radial",
  cost = 1
)

# Score the held-out rows and compare predictions with the true labels.
pred_radial <- predict(svm_radial, newdata = test_data_scaled)
conf_matrix_radial <- confusionMatrix(
  data = pred_radial,
  reference = test_data_scaled$Drug
)
print(conf_matrix_radial)
## Confusion Matrix and Statistics
##
## Reference
## Prediction drugA drugB drugC drugX drugY
## drugA 4 0 0 0 0
## drugB 0 1 0 0 0
## drugC 0 0 3 0 0
## drugX 0 0 0 10 0
## drugY 0 2 0 0 18
##
## Overall Statistics
##
## Accuracy : 0.9474
## 95% CI : (0.8225, 0.9936)
## No Information Rate : 0.4737
## P-Value [Acc > NIR] : 4.248e-10
##
## Kappa : 0.9205
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: drugA Class: drugB Class: drugC Class: drugX
## Sensitivity 1.0000 0.33333 1.00000 1.0000
## Specificity 1.0000 1.00000 1.00000 1.0000
## Pos Pred Value 1.0000 1.00000 1.00000 1.0000
## Neg Pred Value 1.0000 0.94595 1.00000 1.0000
## Prevalence 0.1053 0.07895 0.07895 0.2632
## Detection Rate 0.1053 0.02632 0.07895 0.2632
## Detection Prevalence 0.1053 0.02632 0.07895 0.2632
## Balanced Accuracy 1.0000 0.66667 1.00000 1.0000
## Class: drugY
## Sensitivity 1.0000
## Specificity 0.9000
## Pos Pred Value 0.9000
## Neg Pred Value 1.0000
## Prevalence 0.4737
## Detection Rate 0.4737
## Detection Prevalence 0.5263
## Balanced Accuracy 0.9500
# Candidate hyper-parameter values for the radial SVM. They are defined once
# and reused below so the documented grid and the values actually searched
# cannot drift apart.
cost_values <- c(0.1, 1, 10)
gamma_values <- c(0.1, 1, 10)

# Full grid of cost/gamma combinations, kept as a record of what is searched.
# tune.svm() builds the same combinations itself from the two vectors.
tuning_grid <- expand.grid(
  cost = cost_values,
  gamma = gamma_values
)

# Grid-search cost and gamma on the training data only.
svm_tune <- tune.svm(
  Drug ~ .,
  data = train_data_scaled,
  kernel = "radial",
  cost = cost_values,
  gamma = gamma_values
)

# Best-performing combination found by the search.
print(svm_tune$best.parameters)
## gamma cost
## 7 0.1 10
# Refit the radial SVM on the full training set using the cost/gamma pair
# selected by the tuning step.
best_params <- svm_tune$best.parameters
svm_final <- svm(
  Drug ~ .,
  data = train_data_scaled,
  kernel = "radial",
  cost = best_params$cost,
  gamma = best_params$gamma
)

# Evaluate the tuned model on the held-out rows.
pred_final <- predict(svm_final, newdata = test_data_scaled)
conf_matrix_final <- confusionMatrix(
  data = pred_final,
  reference = test_data_scaled$Drug
)
print(conf_matrix_final)
## Confusion Matrix and Statistics
##
## Reference
## Prediction drugA drugB drugC drugX drugY
## drugA 4 0 0 0 0
## drugB 0 3 0 0 0
## drugC 0 0 3 0 0
## drugX 0 0 0 10 0
## drugY 0 0 0 0 18
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9075, 1)
## No Information Rate : 0.4737
## P-Value [Acc > NIR] : 4.662e-13
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: drugA Class: drugB Class: drugC Class: drugX
## Sensitivity 1.0000 1.00000 1.00000 1.0000
## Specificity 1.0000 1.00000 1.00000 1.0000
## Pos Pred Value 1.0000 1.00000 1.00000 1.0000
## Neg Pred Value 1.0000 1.00000 1.00000 1.0000
## Prevalence 0.1053 0.07895 0.07895 0.2632
## Detection Rate 0.1053 0.07895 0.07895 0.2632
## Detection Prevalence 0.1053 0.07895 0.07895 0.2632
## Balanced Accuracy 1.0000 1.00000 1.00000 1.0000
## Class: drugY
## Sensitivity 1.0000
## Specificity 1.0000
## Pos Pred Value 1.0000
## Neg Pred Value 1.0000
## Prevalence 0.4737
## Detection Rate 0.4737
## Detection Prevalence 0.4737
## Balanced Accuracy 1.0000
# Permutation feature importance for the tuned SVM.
#
# For each predictor, shuffle that column in the test set, re-predict, and
# record how much test accuracy drops relative to the unshuffled predictions.
# Each feature is shuffled once, so the estimates depend on the RNG state.

# All columns except the outcome. setdiff() is used instead of
# names(x)[-which(...)], which would return an empty vector if "Drug" were
# ever absent.
feature_names <- setdiff(names(train_data_scaled), "Drug")

# Accuracy of the unshuffled predictions; constant across features, so it is
# computed once rather than inside the loop.
baseline_accuracy <- mean(pred_final == test_data_scaled$Drug)

importance <- data.frame(
  Feature = feature_names,
  Importance = 0
)

for (feature in feature_names) {
  # Work on a copy so only the current feature is shuffled.
  test_permuted <- test_data_scaled
  test_permuted[[feature]] <- sample(test_permuted[[feature]])

  pred_permuted <- predict(svm_final, test_permuted)
  permuted_accuracy <- mean(pred_permuted == test_data_scaled$Drug)

  importance$Importance[importance$Feature == feature] <-
    baseline_accuracy - permuted_accuracy
}

# Largest accuracy drop (most important feature) first.
importance <- importance[order(-importance$Importance), ]
# Horizontal bar chart of the permutation importances, with features ordered
# by their importance value.
ggplot(
  data = importance,
  mapping = aes(x = reorder(Feature, Importance), y = Importance)
) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Feature Importance in SVM Model",
    x = "Features",
    y = "Importance (Accuracy Drop)"
  )
# Side-by-side accuracy and kappa for every model.
# The Decision Tree figures (0.97 / 0.96) are hard-coded constants, not
# computed here; the SVM figures are read from the confusion matrices.
svm_conf_matrices <- list(
  conf_matrix_linear,
  conf_matrix_radial,
  conf_matrix_final
)

# Pull one overall statistic (by name) out of each SVM confusion matrix.
overall_metric <- function(metric) {
  vapply(
    svm_conf_matrices,
    function(cm) cm$overall[[metric]],
    numeric(1)
  )
}

results_comparison <- data.frame(
  Model = c("Decision Tree", "SVM Linear", "SVM Radial", "SVM Tuned"),
  Accuracy = c(0.97, overall_metric("Accuracy")),
  Kappa = c(0.96, overall_metric("Kappa"))
)
print(results_comparison)
## Model Accuracy Kappa
## 1 Decision Tree 0.9700000 0.9600000
## 2 SVM Linear 0.9473684 0.9210800
## 3 SVM Radial 0.9473684 0.9205021
## 4 SVM Tuned 1.0000000 1.0000000