Assignment #3 R Code
# Libraries used throughout this script
library(caret)  # createDataPartition(), confusionMatrix()
library(ROSE)   # ROSE() resampling
library(e1071)  # svm(), tune.svm()
library(pROC)   # roc(), auc()
# Convert categorical variables to factors
categorical_vars <- c('job', 'marital', 'education', 'default', 'housing',
                      'loan', 'contact', 'month', 'poutcome', 'y')
df[categorical_vars] <- lapply(df[categorical_vars], as.factor)
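# Optional sanity check (assumes df is already loaded from the bank data):
# confirm the conversion took and see how many levels each factor carries.
sapply(df[categorical_vars], nlevels)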
# Feature engineering: flag clients never contacted in a previous campaign (pdays == -1)
df$previously_contacted <- ifelse(df$pdays == -1, "no", "yes")
df$previously_contacted <- as.factor(df$previously_contacted)
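# Optional check: the engineered flag should line up exactly with pdays == -1
# ("no" pairs with TRUE, "yes" with FALSE).
table(df$previously_contacted, df$pdays == -1)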
# Select the Features
features <- c('education', 'marital', 'housing', 'contact',
              'duration', 'month', 'age', 'balance', 'campaign',
              'pdays', 'previously_contacted', 'poutcome')
# Dataset for Analysis
model1_data <- df[, c(features, 'y')]
print(head(model1_data, 10))
## education marital housing contact duration month age balance campaign pdays
## 1 tertiary married yes unknown 261 may 58 2143 1 -1
## 2 secondary single yes unknown 151 may 44 29 1 -1
## 3 secondary married yes unknown 76 may 33 2 1 -1
## 4 unknown married yes unknown 92 may 47 1506 1 -1
## 5 unknown single no unknown 198 may 33 1 1 -1
## 6 tertiary married yes unknown 139 may 35 231 1 -1
## 7 tertiary single yes unknown 217 may 28 447 1 -1
## 8 tertiary divorced yes unknown 380 may 42 2 1 -1
## 9 primary married yes unknown 50 may 58 121 1 -1
## 10 secondary single yes unknown 55 may 43 593 1 -1
## previously_contacted poutcome y
## 1 no unknown no
## 2 no unknown no
## 3 no unknown no
## 4 no unknown no
## 5 no unknown no
## 6 no unknown no
## 7 no unknown no
## 8 no unknown no
## 9 no unknown no
## 10 no unknown no
set.seed(2176)
train_index <- createDataPartition(model1_data$y, p = 0.7, list = FALSE)
train_data <- model1_data[train_index, ]
test_data <- model1_data[-train_index, ]
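# Optional check: createDataPartition() stratifies on y, so the class mix in
# train and test should closely match the full dataset.
prop.table(table(train_data$y))
prop.table(table(test_data$y))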
# Address the class imbalance with ROSE (default p = 0.5 gives roughly 50/50 classes)
balanced_train_ROSE <- ROSE(y ~ ., data = train_data, seed = 2176)$data
# Rebalance so the minority class ("yes") makes up about 60% of the sample
balanced_train_rose2 <- ROSE(y ~ .,
                             data = train_data,
                             seed = 2176,
                             p = 0.6,
                             N = nrow(train_data))$data
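# Optional check: verify the resampled class proportions match the targets
# (~50/50 for the default call; about 60% "yes" for the p = 0.6 call).
prop.table(table(balanced_train_ROSE$y))
prop.table(table(balanced_train_rose2$y))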
svm_model1 <- svm(y ~ .,
                  data = balanced_train_rose2,
                  kernel = "linear",
                  cost = 1,
                  scale = TRUE,
                  probability = TRUE)
# Make predictions
pred_svm <- predict(svm_model1, test_data)
# Get probability predictions: with probability = TRUE, predict() returns the
# class labels with the probability matrix attached as the "probabilities" attribute
prob_svm <- predict(svm_model1, test_data, probability = TRUE)
prob_matrix <- attr(prob_svm, "probabilities")
# Calculate confusion matrix
conf_matrix_svm <- confusionMatrix(pred_svm, test_data$y, positive = "yes")
# Calculate ROC and AUC using probability matrix
if (!is.null(prob_matrix)) {
  roc_svm <- roc(response = test_data$y,
                 predictor = prob_matrix[, "yes"],
                 levels = c("no", "yes"))
  auc_value <- auc(roc_svm)
} else {
  auc_value <- NA
  print("Warning: Probability estimates not available")
}
## Setting direction: controls < cases
# Print results
print("SVM Experiment 1 Results:")
## [1] "SVM Experiment 1 Results:"
print(conf_matrix_svm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 9529 216
## yes 2447 1370
##
## Accuracy : 0.8036
## 95% CI : (0.7969, 0.8103)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4096
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8638
## Specificity : 0.7957
## Pos Pred Value : 0.3589
## Neg Pred Value : 0.9778
## Prevalence : 0.1169
## Detection Rate : 0.1010
## Detection Prevalence : 0.2814
## Balanced Accuracy : 0.8297
##
## 'Positive' Class : yes
##
if (!is.na(auc_value)) {
  print(paste("SVM AUC:", round(auc_value, 4)))
}
## [1] "SVM AUC: 0.9026"
# Tune on a random subsample (at most 5,000 rows) to keep run time manageable
tuning_sample <- balanced_train_rose2[sample(nrow(balanced_train_rose2),
                                             size = min(5000, nrow(balanced_train_rose2))), ]
# Tune cost and gamma for the radial kernel with 5-fold cross-validation
tune_svm <- tune.svm(y ~ .,
                     data = tuning_sample,
                     kernel = "radial",
                     cost = c(1, 10),
                     gamma = c(0.1, 1),
                     probability = TRUE,
                     tunecontrol = tune.control(sampling = "cross",
                                                cross = 5))
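# Optional check: summary() reports the cross-validated error for every
# cost/gamma combination; best.parameters holds the winning pair.
summary(tune_svm)
tune_svm$best.parameters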
# Refit on the full balanced training set using the best parameters found
svm_model2 <- svm(y ~ .,
                  data = balanced_train_rose2,
                  kernel = "radial",
                  cost = tune_svm$best.parameters$cost,
                  gamma = tune_svm$best.parameters$gamma,
                  probability = TRUE)
# Make predictions with tuned model
pred_svm2 <- predict(svm_model2, test_data)
prob_predictions <- predict(svm_model2, test_data, probability = TRUE)
prob_matrix <- attr(prob_predictions, "probabilities")
# Calculate confusion matrix
conf_matrix_svm2 <- confusionMatrix(pred_svm2, test_data$y, positive = "yes")
# Calculate ROC and AUC using proper probability matrix
tryCatch({
  roc_svm2 <- roc(response = test_data$y,
                  predictor = prob_matrix[, "yes"],
                  levels = c("no", "yes"))
  auc_value <- auc(roc_svm2)
  # Print results
  print("SVM Experiment 2 (Tuned) Results:")
  print(conf_matrix_svm2)
  print(paste("Tuned SVM AUC:", round(auc_value, 4)))
}, error = function(e) {
  print("Error calculating ROC/AUC - checking probability matrix:")
  str(prob_matrix)  # str() prints its output directly
})
## Setting direction: controls < cases
## [1] "\nSVM Experiment 2 (Tuned) Results:"
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 10012 258
## yes 1964 1328
##
## Accuracy : 0.8362
## 95% CI : (0.8298, 0.8424)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4591
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.83733
## Specificity : 0.83601
## Pos Pred Value : 0.40340
## Neg Pred Value : 0.97488
## Prevalence : 0.11694
## Detection Rate : 0.09792
## Detection Prevalence : 0.24274
## Balanced Accuracy : 0.83667
##
## 'Positive' Class : yes
##
## [1] "Tuned SVM AUC: 0.9065"
# Build a comparison data frame across all models
comparison_df <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Pos Pred Value", "AUC"),
  # Decision Tree Results (from previous experiments)
  DecisionTree_50_50 = c(0.7977, 0.8323, 0.7931, 0.3476, 0.839),
  DecisionTree_40_60 = c(0.8394, 0.7686, 0.8488, 0.4023, 0.827),
  DecisionTree_60_40 = c(0.7930, 0.8569, 0.7845, 0.3449, 0.861),
  # Random Forest Results
  RandomForest_1 = c(0.8294, 0.8758, 0.8232, 0.3962, 0.917),
  RandomForest_2 = c(0.8452, 0.8430, 0.8454, 0.4194, 0.917),
  # AdaBoost Results
  AdaBoost_Original = c(0.8166, 0.8689, 0.8097, 0.3768, NA),
  AdaBoost_Real = c(0.8276, 0.8581, 0.8236, 0.3918, NA),
  # New SVM Results (taken from the confusion matrices and AUCs reported above)
  SVM = c(0.8036, 0.8638, 0.7957, 0.3589, 0.9026),
  SVM2 = c(0.8362, 0.8373, 0.8360, 0.4034, 0.9065)
)
# Print full comparison
print("Complete Model Comparison:")
## [1] "Complete Model Comparison:"
print(comparison_df)
## Metric DecisionTree_50_50 DecisionTree_40_60 DecisionTree_60_40
## 1 Accuracy 0.7977 0.8394 0.7930
## 2 Sensitivity 0.8323 0.7686 0.8569
## 3 Specificity 0.7931 0.8488 0.7845
## 4 Pos Pred Value 0.3476 0.4023 0.3449
## 5 AUC 0.8390 0.8270 0.8610
## RandomForest_1 RandomForest_2 AdaBoost_Original AdaBoost_Real SVM SVM2
## 1         0.8294         0.8452            0.8166        0.8276 0.8036 0.8362
## 2         0.8758         0.8430            0.8689        0.8581 0.8638 0.8373
## 3         0.8232         0.8454            0.8097        0.8236 0.7957 0.8360
## 4         0.3962         0.4194            0.3768        0.3918 0.3589 0.4034
## 5         0.9170         0.9170                NA            NA 0.9026 0.9065
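# Optional closing sketch: rank the models by AUC (row 5 of the table above),
# keeping the AdaBoost columns (AUC not recorded) at the end.
auc_row <- unlist(comparison_df[5, -1])
sort(auc_row, decreasing = TRUE, na.last = TRUE)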