Assignment #3 R Code
# Libraries used throughout this script
library(caret)  # createDataPartition(), confusionMatrix()
library(ROSE)   # ROSE() resampling
library(e1071)  # svm(), tune.svm()
library(pROC)   # roc(), auc()
# Convert categorical variables to factors
categorical_vars <- c('job', 'marital', 'education', 'default', 'housing',
                      'loan', 'contact', 'month', 'poutcome', 'y')
df[categorical_vars] <- lapply(df[categorical_vars], as.factor)
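# Optional sanity check (assumes df is already loaded from the bank data):
# confirm the conversion took and see how many levels each factor carries.
sapply(df[categorical_vars], nlevels)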
# Feature engineering: flag clients never contacted in a previous campaign (pdays == -1)
df$previously_contacted <- ifelse(df$pdays == -1, "no", "yes")
df$previously_contacted <- as.factor(df$previously_contacted)
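# Optional check: the engineered flag should line up exactly with pdays == -1
# ("no" pairs with TRUE, "yes" with FALSE).
table(df$previously_contacted, df$pdays == -1)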
# Select the Features
features <- c('education', 'marital', 'housing', 'contact',
              'duration', 'month', 'age', 'balance', 'campaign',
              'pdays', 'previously_contacted', 'poutcome')
# Dataset for Analysis
model1_data <- df[, c(features, 'y')]
print(head(model1_data, 10))
## education marital housing contact duration month age balance campaign pdays
## 1 tertiary married yes unknown 261 may 58 2143 1 -1
## 2 secondary single yes unknown 151 may 44 29 1 -1
## 3 secondary married yes unknown 76 may 33 2 1 -1
## 4 unknown married yes unknown 92 may 47 1506 1 -1
## 5 unknown single no unknown 198 may 33 1 1 -1
## 6 tertiary married yes unknown 139 may 35 231 1 -1
## 7 tertiary single yes unknown 217 may 28 447 1 -1
## 8 tertiary divorced yes unknown 380 may 42 2 1 -1
## 9 primary married yes unknown 50 may 58 121 1 -1
## 10 secondary single yes unknown 55 may 43 593 1 -1
## previously_contacted poutcome y
## 1 no unknown no
## 2 no unknown no
## 3 no unknown no
## 4 no unknown no
## 5 no unknown no
## 6 no unknown no
## 7 no unknown no
## 8 no unknown no
## 9 no unknown no
## 10 no unknown no
set.seed(2176)
train_index <- createDataPartition(model1_data$y, p = 0.7, list = FALSE)
train_data <- model1_data[train_index, ]
test_data <- model1_data[-train_index, ]
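# Optional check: createDataPartition() stratifies on y, so the class mix in
# train and test should closely match the full dataset.
prop.table(table(train_data$y))
prop.table(table(test_data$y))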
# Address the class imbalance with ROSE (default p = 0.5 gives roughly 50/50 classes)
balanced_train_ROSE <- ROSE(y ~ ., data = train_data, seed = 2176)$data
# Rebalance so the minority class ("yes") makes up about 60% of the sample
balanced_train_rose2 <- ROSE(y ~ .,
                             data = train_data,
                             seed = 2176,
                             p = 0.6,
                             N = nrow(train_data))$data
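# Optional check: verify the resampled class proportions match the targets
# (~50/50 for the default call; about 60% "yes" for the p = 0.6 call).
prop.table(table(balanced_train_ROSE$y))
prop.table(table(balanced_train_rose2$y))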
svm_model1 <- svm(y ~ .,
                  data = balanced_train_rose2,
                  kernel = "linear",
                  cost = 1,
                  scale = TRUE,
                  probability = TRUE)
# Make predictions
pred_svm <- predict(svm_model1, test_data)
# Get probability predictions: with probability = TRUE, predict() returns the
# class labels with the probability matrix attached as the "probabilities" attribute
prob_svm <- predict(svm_model1, test_data, probability = TRUE)
prob_matrix <- attr(prob_svm, "probabilities")
# Calculate confusion matrix
conf_matrix_svm <- confusionMatrix(pred_svm, test_data$y, positive = "yes")
# Calculate ROC and AUC using probability matrix
if (!is.null(prob_matrix)) {
  roc_svm <- roc(response = test_data$y,
                 predictor = prob_matrix[, "yes"],
                 levels = c("no", "yes"))
  auc_value <- auc(roc_svm)
} else {
  auc_value <- NA
  print("Warning: Probability estimates not available")
}
## Setting direction: controls < cases
# Print results
print("SVM Experiment 1 Results:")
## [1] "SVM Experiment 1 Results:"
print(conf_matrix_svm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 9529 216
## yes 2447 1370
##
## Accuracy : 0.8036
## 95% CI : (0.7969, 0.8103)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4096
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8638
## Specificity : 0.7957
## Pos Pred Value : 0.3589
## Neg Pred Value : 0.9778
## Prevalence : 0.1169
## Detection Rate : 0.1010
## Detection Prevalence : 0.2814
## Balanced Accuracy : 0.8297
##
## 'Positive' Class : yes
##
if (!is.na(auc_value)) {
  print(paste("SVM AUC:", round(auc_value, 4)))
}
## [1] "SVM AUC: 0.9026"
# Tune on a random subsample (at most 5,000 rows) to keep run time manageable
tuning_sample <- balanced_train_rose2[sample(nrow(balanced_train_rose2),
                                             size = min(5000, nrow(balanced_train_rose2))), ]
# Tune cost and gamma for the radial kernel with 5-fold cross-validation
tune_svm <- tune.svm(y ~ .,
                     data = tuning_sample,
                     kernel = "radial",
                     cost = c(1, 10),
                     gamma = c(0.1, 1),
                     probability = TRUE,
                     tunecontrol = tune.control(sampling = "cross",
                                                cross = 5))
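# Optional check: summary() reports the cross-validated error for every
# cost/gamma combination; best.parameters holds the winning pair.
summary(tune_svm)
tune_svm$best.parameters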
# Refit on the full balanced training set using the best parameters found
svm_model2 <- svm(y ~ .,
                  data = balanced_train_rose2,
                  kernel = "radial",
                  cost = tune_svm$best.parameters$cost,
                  gamma = tune_svm$best.parameters$gamma,
                  probability = TRUE)
# Make predictions with tuned model
pred_svm2 <- predict(svm_model2, test_data)
prob_predictions <- predict(svm_model2, test_data, probability = TRUE)
prob_matrix <- attr(prob_predictions, "probabilities")
# Calculate confusion matrix
conf_matrix_svm2 <- confusionMatrix(pred_svm2, test_data$y, positive = "yes")
# Calculate ROC and AUC using proper probability matrix
tryCatch({
  roc_svm2 <- roc(response = test_data$y,
                  predictor = prob_matrix[, "yes"],
                  levels = c("no", "yes"))
  auc_value <- auc(roc_svm2)
  # Print results
  print("SVM Experiment 2 (Tuned) Results:")
  print(conf_matrix_svm2)
  print(paste("Tuned SVM AUC:", round(auc_value, 4)))
}, error = function(e) {
  print("Error calculating ROC/AUC - checking probability matrix:")
  str(prob_matrix)  # str() prints its output directly
})
## Setting direction: controls < cases
## [1] "\nSVM Experiment 2 (Tuned) Results:"
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 10012 258
## yes 1964 1328
##
## Accuracy : 0.8362
## 95% CI : (0.8298, 0.8424)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4591
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.83733
## Specificity : 0.83601
## Pos Pred Value : 0.40340
## Neg Pred Value : 0.97488
## Prevalence : 0.11694
## Detection Rate : 0.09792
## Detection Prevalence : 0.24274
## Balanced Accuracy : 0.83667
##
## 'Positive' Class : yes
##
## [1] "Tuned SVM AUC: 0.9065"
# Build a comparison data frame across all models
comparison_df <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Pos Pred Value", "AUC"),
  # Decision Tree Results (from previous experiments)
  DecisionTree_50_50 = c(0.7977, 0.8323, 0.7931, 0.3476, 0.839),
  DecisionTree_40_60 = c(0.8394, 0.7686, 0.8488, 0.4023, 0.827),
  DecisionTree_60_40 = c(0.7930, 0.8569, 0.7845, 0.3449, 0.861),
  # Random Forest Results
  RandomForest_1 = c(0.8294, 0.8758, 0.8232, 0.3962, 0.917),
  RandomForest_2 = c(0.8452, 0.8430, 0.8454, 0.4194, 0.917),
  # AdaBoost Results
  AdaBoost_Original = c(0.8166, 0.8689, 0.8097, 0.3768, NA),
  AdaBoost_Real = c(0.8276, 0.8581, 0.8236, 0.3918, NA),
  # New SVM Results (taken from the confusion matrices and AUCs reported above)
  SVM = c(0.8036, 0.8638, 0.7957, 0.3589, 0.9026),
  SVM2 = c(0.8362, 0.8373, 0.8360, 0.4034, 0.9065)
)
# Print full comparison
print("Complete Model Comparison:")
## [1] "Complete Model Comparison:"
print(comparison_df)
## Metric DecisionTree_50_50 DecisionTree_40_60 DecisionTree_60_40
## 1 Accuracy 0.7977 0.8394 0.7930
## 2 Sensitivity 0.8323 0.7686 0.8569
## 3 Specificity 0.7931 0.8488 0.7845
## 4 Pos Pred Value 0.3476 0.4023 0.3449
## 5 AUC 0.8390 0.8270 0.8610
## RandomForest_1 RandomForest_2 AdaBoost_Original AdaBoost_Real SVM SVM2
## 1         0.8294         0.8452            0.8166        0.8276 0.8036 0.8362
## 2         0.8758         0.8430            0.8689        0.8581 0.8638 0.8373
## 3         0.8232         0.8454            0.8097        0.8236 0.7957 0.8360
## 4         0.3962         0.4194            0.3768        0.3918 0.3589 0.4034
## 5         0.9170         0.9170                NA            NA 0.9026 0.9065
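# Optional closing sketch: rank the models by AUC (row 5 of the table above),
# keeping the AdaBoost columns (AUC not recorded) at the end.
auc_row <- unlist(comparison_df[5, -1])
sort(auc_row, decreasing = TRUE, na.last = TRUE)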