Exploratory Data analysis

## Rows: 4,521
## Columns: 17
## $ age       <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, …
## $ job       <chr> "unemployed", "services", "management", "management", "blue-…
## $ marital   <chr> "married", "married", "single", "married", "married", "singl…
## $ education <chr> "primary", "secondary", "tertiary", "tertiary", "secondary",…
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
## $ balance   <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 26…
## $ housing   <chr> "no", "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes",…
## $ loan      <chr> "no", "yes", "no", "yes", "no", "no", "no", "no", "no", "yes…
## $ contact   <chr> "cellular", "cellular", "cellular", "unknown", "unknown", "c…
## $ day       <int> 19, 11, 16, 3, 5, 23, 14, 6, 14, 17, 20, 17, 13, 30, 29, 29,…
## $ month     <chr> "oct", "may", "apr", "jun", "may", "feb", "may", "may", "may…
## $ duration  <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 32…
## $ campaign  <int> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, 1, …
## $ pdays     <int> -1, 339, 330, -1, -1, 176, 330, -1, -1, 147, -1, -1, -1, -1,…
## $ previous  <int> 0, 4, 1, 0, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, …
## $ poutcome  <chr> "unknown", "failure", "failure", "unknown", "unknown", "fail…
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …

#convert categorical variables to a factor

bank <- bank %>%
  mutate(y = factor(y, levels = c("no", "yes")),  
         job = as.factor(job), 
         marital = as.factor(marital),
         education = as.factor(education),
         default = as.factor(default),
         housing = as.factor(housing),
         loan = as.factor(loan),
         contact = as.factor(contact),
         month = as.factor(month),
         poutcome = as.factor(poutcome),
        job  = as.factor(job))

#Summary statistics

summary(bank)

##       age                 job          marital         education    default   
##  Min.   :19.00   management :969   divorced: 528   primary  : 678   no :4445  
##  1st Qu.:33.00   blue-collar:946   married :2797   secondary:2306   yes:  76  
##  Median :39.00   technician :768   single  :1196   tertiary :1350             
##  Mean   :41.17   admin.     :478                   unknown  : 187             
##  3rd Qu.:49.00   services   :417                                              
##  Max.   :87.00   retired    :230                                              
##                  (Other)    :713                                              
##     balance      housing     loan           contact          day       
##  Min.   :-3313   no :1962   no :3830   cellular :2896   Min.   : 1.00  
##  1st Qu.:   69   yes:2559   yes: 691   telephone: 301   1st Qu.: 9.00  
##  Median :  444                         unknown  :1324   Median :16.00  
##  Mean   : 1423                                          Mean   :15.92  
##  3rd Qu.: 1480                                          3rd Qu.:21.00  
##  Max.   :71188                                          Max.   :31.00  
##                                                                        
##      month         duration       campaign          pdays       
##  may    :1398   Min.   :   4   Min.   : 1.000   Min.   : -1.00  
##  jul    : 706   1st Qu.: 104   1st Qu.: 1.000   1st Qu.: -1.00  
##  aug    : 633   Median : 185   Median : 2.000   Median : -1.00  
##  jun    : 531   Mean   : 264   Mean   : 2.794   Mean   : 39.77  
##  nov    : 389   3rd Qu.: 329   3rd Qu.: 3.000   3rd Qu.: -1.00  
##  apr    : 293   Max.   :3025   Max.   :50.000   Max.   :871.00  
##  (Other): 571                                                   
##     previous          poutcome      y       
##  Min.   : 0.0000   failure: 490   no :4000  
##  1st Qu.: 0.0000   other  : 197   yes: 521  
##  Median : 0.0000   success: 129             
##  Mean   : 0.5426   unknown:3705             
##  3rd Qu.: 0.0000                            
##  Max.   :25.0000                            
##

n_bank <- bank %>%
  select(where(is.numeric))
  
summary(n_bank)

##       age           balance           day           duration   
##  Min.   :19.00   Min.   :-3313   Min.   : 1.00   Min.   :   4  
##  1st Qu.:33.00   1st Qu.:   69   1st Qu.: 9.00   1st Qu.: 104  
##  Median :39.00   Median :  444   Median :16.00   Median : 185  
##  Mean   :41.17   Mean   : 1423   Mean   :15.92   Mean   : 264  
##  3rd Qu.:49.00   3rd Qu.: 1480   3rd Qu.:21.00   3rd Qu.: 329  
##  Max.   :87.00   Max.   :71188   Max.   :31.00   Max.   :3025  
##     campaign          pdays           previous      
##  Min.   : 1.000   Min.   : -1.00   Min.   : 0.0000  
##  1st Qu.: 1.000   1st Qu.: -1.00   1st Qu.: 0.0000  
##  Median : 2.000   Median : -1.00   Median : 0.0000  
##  Mean   : 2.794   Mean   : 39.77   Mean   : 0.5426  
##  3rd Qu.: 3.000   3rd Qu.: -1.00   3rd Qu.: 0.0000  
##  Max.   :50.000   Max.   :871.00   Max.   :25.0000

c_bank <- bank %>% select(-where(is.numeric))
summary(c_bank)

##           job          marital         education    default    housing   
##  management :969   divorced: 528   primary  : 678   no :4445   no :1962  
##  blue-collar:946   married :2797   secondary:2306   yes:  76   yes:2559  
##  technician :768   single  :1196   tertiary :1350                        
##  admin.     :478                   unknown  : 187                        
##  services   :417                                                         
##  retired    :230                                                         
##  (Other)    :713                                                         
##   loan           contact         month         poutcome      y       
##  no :3830   cellular :2896   may    :1398   failure: 490   no :4000  
##  yes: 691   telephone: 301   jul    : 706   other  : 197   yes: 521  
##             unknown  :1324   aug    : 633   success: 129             
##                              jun    : 531   unknown:3705             
##                              nov    : 389                            
##                              apr    : 293                            
##                              (Other): 571

colSums(is.na(bank))

##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  previous  poutcome 
##         0         0         0         0         0         0         0         0 
##         y 
##         0

#numeric distributions

n_bank %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Numeric Variables")

c_bank %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
  ggplot(aes(x = value)) +
  geom_bar(fill = "blue", alpha = 0.7) +
  facet_wrap(~variable, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Categorical Variables") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

n_bank <- n_bank %>%
  mutate(y = bank$y)  


n_bank <- n_bank %>%
  mutate(y = as.factor(y))


n_bank_long <- n_bank %>%
  pivot_longer(cols = -y, names_to = "variable", values_to = "value")


ggplot(n_bank_long, aes(x = y, y = value, fill = y)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free") +
  labs(title = "Distribution of Numeric Variables by Yes/No in Y",
       x = "Y (Outcome)", y = "Value") +
  theme_minimal()

c_bank <- c_bank %>%
  mutate(y = bank$y) %>%
  mutate(y = as.factor(y))  


c_bank_long <- c_bank %>%
  pivot_longer(cols = -y, names_to = "variable", values_to = "value") %>%
  count(variable, value, y)  


ggplot(c_bank_long, aes(x = value, y = n, fill = y)) +  
  geom_bar(stat = "identity", position = "dodge", alpha = 0.7) + 
  facet_wrap(~variable, scales = "free") +  
  labs(title = "Relationship Between Categorical Variables and Y (Yes/No)",
       x = "Category", y = "Count") +
  scale_fill_manual(values = c("yes" = "blue", "no" = "red")) + 
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

n_bank <- n_bank %>%
  mutate(y = as.numeric(as.factor(y)) - 1) 


cor_matrix <- cor(n_bank, use = "complete.obs")


ggcorrplot(cor_matrix, 
           method = "circle",  
           type = "lower",      
           lab = TRUE,       
           lab_size = 3,        
           colors = c("blue", "white", "red"), 
           title = "Correlation Heatmap",
           ggtheme = theme_minimal())

Experiment

Decision trees models

Partition

80% of the data will be used for training and 20% will be used for testing

set.seed(1234)
sample <- sample(nrow(bank), round(nrow(bank)*.8),
                 replace = FALSE)

bank_train <- bank[sample,]
bank_test <- bank[-sample,]

round(prop.table(table(select(bank, y))),2)

## y
##   no  yes 
## 0.88 0.12

round(prop.table(table(select(bank_train, y))),2)

## y
##   no  yes 
## 0.88 0.12

round(prop.table(table(select(bank_test, y))),2)

## y
##   no  yes 
## 0.89 0.11

Model 1

First model includes all variables and we will split the data 80/20. We want to understand which variables are significant

bank_mod <-
  rpart(
    y ~ .,
    method = "class",
    data = bank_train
 )
rpart.plot(bank_mod)

bank_pred <- predict(bank_mod, bank_test, type = "class")
bank_pred_table <- table(bank_test$y, bank_pred)
bank_pred_table

##      bank_pred
##        no yes
##   no  774  29
##   yes  74  27

sum(diag(bank_pred_table)) / nrow(bank_test)

## [1] 0.8860619

cm <- confusionMatrix(data = bank_pred, reference = bank_test$y, positive = "yes")
cm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  774  74
##        yes  29  27
##                                          
##                Accuracy : 0.8861         
##                  95% CI : (0.8635, 0.906)
##     No Information Rate : 0.8883         
##     P-Value [Acc > NIR] : 0.609          
##                                          
##                   Kappa : 0.2871         
##                                          
##  Mcnemar's Test P-Value : 1.455e-05      
##                                          
##             Sensitivity : 0.26733        
##             Specificity : 0.96389        
##          Pos Pred Value : 0.48214        
##          Neg Pred Value : 0.91274        
##              Prevalence : 0.11173        
##          Detection Rate : 0.02987        
##    Detection Prevalence : 0.06195        
##       Balanced Accuracy : 0.61561        
##                                          
##        'Positive' Class : yes            
##

sensitivity is very low, only 27% of successes were detected. Model predicts yes correct less than 50% of the time.

Model 2

balanced data using all variables Use Rose to over sample the yes class Hypothesis: A balanced data will improve the accuracy, Kappa, and Pos prediction value

data_balanced <- ROSE(y ~ ., data = bank, seed = 124)$data


table(data_balanced$y)

## 
##   no  yes 
## 2313 2208

set.seed(124) 
trainIndex <- createDataPartition(data_balanced$y, p = 0.8, list = FALSE)
bank_train2 <- data_balanced[trainIndex, ]
bank_test2 <- data_balanced[-trainIndex, ]

bank_mod2 <-
  rpart(
    y ~ .,
    method = "class",
    data = bank_train2
 )
rpart.plot(bank_mod2)

bank_pred2 <- predict(bank_mod2, bank_test2, type = "class")

cm2 <- confusionMatrix(data = bank_pred2, reference = bank_test2$y, positive = "yes")
cm2

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  359  82
##        yes 103 359
##                                          
##                Accuracy : 0.7951         
##                  95% CI : (0.7673, 0.821)
##     No Information Rate : 0.5116         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.5905         
##                                          
##  Mcnemar's Test P-Value : 0.1414         
##                                          
##             Sensitivity : 0.8141         
##             Specificity : 0.7771         
##          Pos Pred Value : 0.7771         
##          Neg Pred Value : 0.8141         
##              Prevalence : 0.4884         
##          Detection Rate : 0.3976         
##    Detection Prevalence : 0.5116         
##       Balanced Accuracy : 0.7956         
##                                          
##        'Positive' Class : yes            
##

after balancing the data, the models sensitivity improves 4 times the previous model. Kappa - .56 Sensitivity - .81 ## Random forest

Model 3

Random forest using the balanced data and significant factors; Job had very low importance. Accuracy, specificity, and Kappa remain high. Kappa : 0.6037
Sensitivity : 0.8186
Specificity : 0.7857

bank_train2$y <- factor(bank_train2$y, levels = c("yes", "no"))
bank_test2$y <- factor(bank_test2$y, levels = c("yes", "no"))
set.seed(126)  
rf_model <- randomForest(y ~ duration + poutcome + job + month , data = bank_train2, ntree = 500, mtry = sqrt(ncol(bank_train2) - 1), importance = TRUE)

print(rf_model)

## 
## Call:
##  randomForest(formula = y ~ duration + poutcome + job + month,      data = bank_train2, ntree = 500, mtry = sqrt(ncol(bank_train2) -          1), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 19.65%
## Confusion matrix:
##      yes   no class.error
## yes 1420  347   0.1963780
## no   364 1487   0.1966505

importance(rf_model)

##                yes        no MeanDecreaseAccuracy MeanDecreaseGini
## duration 174.23918 150.86938             221.1754        1037.9661
## poutcome  95.83095  97.54879             133.3147         181.0547
## job       71.06409  20.08557              69.0880         223.5180
## month    105.64045  81.29794             131.0640         364.9532

varImpPlot(rf_model)

predictions <- predict(rf_model, bank_test2, type = "class")
confusionMatrix(table(predictions, bank_test2$y))

## Confusion Matrix and Statistics
## 
##            
## predictions yes  no
##         yes 361  99
##         no   80 363
##                                           
##                Accuracy : 0.8018          
##                  95% CI : (0.7742, 0.8273)
##     No Information Rate : 0.5116          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6037          
##                                           
##  Mcnemar's Test P-Value : 0.1785          
##                                           
##             Sensitivity : 0.8186          
##             Specificity : 0.7857          
##          Pos Pred Value : 0.7848          
##          Neg Pred Value : 0.8194          
##              Prevalence : 0.4884          
##          Detection Rate : 0.3998          
##    Detection Prevalence : 0.5094          
##       Balanced Accuracy : 0.8022          
##                                           
##        'Positive' Class : yes             
##

For our next model we will go back to using all of the variables to understand which are important to predict a success in subcriptions to a term deposit.

Model 4

Balanced data with all variables. The error rate for yes misclassifications decreased by a couple of percentage points. The Kappa improves to 72%, meaning that the model’s prediction agrees with the actual outcomes very well when accounting for the possibility of agreement by chance. Month along with duration is a significant factor as in the other models.

set.seed(127)  
rf_model2 <- randomForest(y ~ . , data = bank_train2, ntree = 500, mtry = sqrt(ncol(bank_train2) - 1), importance = TRUE)

print(rf_model2)

## 
## Call:
##  randomForest(formula = y ~ ., data = bank_train2, ntree = 500,      mtry = sqrt(ncol(bank_train2) - 1), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 12.6%
## Confusion matrix:
##      yes   no class.error
## yes 1565  202   0.1143181
## no   254 1597   0.1372231

importance(rf_model2)

##                  yes         no MeanDecreaseAccuracy MeanDecreaseGini
## age        19.820523  21.572753            29.700444       103.723779
## job        49.632793  17.263917            49.445207       126.675076
## marital    24.374131  10.114840            25.286746        28.699770
## education  18.927340   9.791511            21.902100        31.341975
## default     6.901647   2.935399             6.525886         2.891009
## balance    20.221959   8.092355            19.457560       102.214957
## housing    21.785807  11.726264            22.117810        20.989088
## loan       19.174621   8.614734            19.445217        15.450295
## contact    25.387859  32.874001            36.256648        44.063995
## day        18.308964  19.305632            26.024022       103.066523
## month      66.603151  61.593286            83.421355       221.760095
## duration  132.482029 133.421898           158.444418       514.816303
## campaign   25.589388   5.854974            23.598160       101.715509
## pdays       5.182782  34.678797            36.324847       129.501015
## previous   18.917044  42.631203            44.014716       164.431657
## poutcome   24.525720  40.771142            50.914476        94.491496

varImpPlot(rf_model2)

predictions <- predict(rf_model2, bank_test2, type = "class")
confusionMatrix(table(predictions, bank_test2$y))

## Confusion Matrix and Statistics
## 
##            
## predictions yes  no
##         yes 388  71
##         no   53 391
##                                           
##                Accuracy : 0.8627          
##                  95% CI : (0.8385, 0.8845)
##     No Information Rate : 0.5116          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7255          
##                                           
##  Mcnemar's Test P-Value : 0.1268          
##                                           
##             Sensitivity : 0.8798          
##             Specificity : 0.8463          
##          Pos Pred Value : 0.8453          
##          Neg Pred Value : 0.8806          
##              Prevalence : 0.4884          
##          Detection Rate : 0.4297          
##    Detection Prevalence : 0.5083          
##       Balanced Accuracy : 0.8631          
##                                           
##        'Positive' Class : yes             
##

Adaboost

Model 5

5 k-fold cross validation 500 trees Hypothesis: remove bias and overfitting

set.seed(128)
ada_model <- train(y ~ duration + month + previous + poutcome , data = bank_train2, method = "ada", trControl = trainControl(method = "cv", number = 5))


print(ada_model)

## Boosted Classification Trees 
## 
## 3618 samples
##    4 predictor
##    2 classes: 'yes', 'no' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 2894, 2894, 2895, 2895, 2894 
## Resampling results across tuning parameters:
## 
##   maxdepth  iter  Accuracy   Kappa     
##   1          50   0.2636857  -0.4647131
##   1         100   0.2426797  -0.5084060
##   1         150   0.2363227  -0.5214948
##   2          50   0.2260945  -0.5425801
##   2         100   0.2222259  -0.5507751
##   2         150   0.2175256  -0.5606495
##   3          50   0.2147609  -0.5659306
##   3         100   0.2059165  -0.5846610
##   3         150   0.2017694  -0.5935630
## 
## Tuning parameter 'nu' was held constant at a value of 0.1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were iter = 50, maxdepth = 1 and nu = 0.1.

Month showed no importance

pred <- predict(ada_model, newdata = bank_test2, type = "raw")  
confusionMatrix(table(pred, bank_test2$y))

## Confusion Matrix and Statistics
## 
##      
## pred  yes  no
##   yes 182 394
##   no  259  68
##                                           
##                Accuracy : 0.2769          
##                  95% CI : (0.2479, 0.3073)
##     No Information Rate : 0.5116          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.4371         
##                                           
##  Mcnemar's Test P-Value : 1.573e-07       
##                                           
##             Sensitivity : 0.4127          
##             Specificity : 0.1472          
##          Pos Pred Value : 0.3160          
##          Neg Pred Value : 0.2080          
##              Prevalence : 0.4884          
##          Detection Rate : 0.2016          
##    Detection Prevalence : 0.6379          
##       Balanced Accuracy : 0.2799          
##                                           
##        'Positive' Class : yes             
##

plot(varImp(ada_model))

Accuracy : 0.2769 Sensitivity : 0.4127 Kappa : -0.4371
results worse than random prediction ### Model 6 increase K-folds to reduce variance and get a better performance estimate

set.seed(129)

ada_model2 <- train(y ~ . - job, 
                   data = bank_train2, 
                   method = "ada", 
                   trControl = trainControl(method = "cv", number = 20))

pred <- predict(ada_model2, newdata = bank_test2, type = "raw")  
confusionMatrix(table(pred, bank_test2$y))

## Confusion Matrix and Statistics
## 
##      
## pred  yes  no
##   yes 171 387
##   no  270  75
##                                           
##                Accuracy : 0.2724          
##                  95% CI : (0.2436, 0.3027)
##     No Information Rate : 0.5116          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.4472         
##                                           
##  Mcnemar's Test P-Value : 6.023e-06       
##                                           
##             Sensitivity : 0.3878          
##             Specificity : 0.1623          
##          Pos Pred Value : 0.3065          
##          Neg Pred Value : 0.2174          
##              Prevalence : 0.4884          
##          Detection Rate : 0.1894          
##    Detection Prevalence : 0.6179          
##       Balanced Accuracy : 0.2750          
##                                           
##        'Positive' Class : yes             
##

plot(varImp(ada_model2))

Similar results as the first adaboost model.

model_results <- tibble(
  Model        = c("Decision tree (bank_mod)", "Decision tree (bank_mod2)", "Random Forest (rf_model)","Random Forest (rf_model2)", "Adaboost (ada_model)", "Adaboos (ada_model2"),
  Accuracy     = c(0.8861, 0.7951, 0.8018, 0.8627, 0.2769, 0.2724),  
  Sensitivity  = c(0.2673, 0.8141, 0.8186, 0.8798, 0.4127, 0.3878),
  Specificity  = c(0.96389,0.7771,0.7857, 0.8463 ,0.1472,0.1623),
  Kappa        = c(0.2871, 0.59, 0.6037, 0.7255, -0.4371, -0.4472)  
)


print(model_results)

## # A tibble: 6 × 5
##   Model                     Accuracy Sensitivity Specificity  Kappa
##   <chr>                        <dbl>       <dbl>       <dbl>  <dbl>
## 1 Decision tree (bank_mod)     0.886       0.267       0.964  0.287
## 2 Decision tree (bank_mod2)    0.795       0.814       0.777  0.59 
## 3 Random Forest (rf_model)     0.802       0.819       0.786  0.604
## 4 Random Forest (rf_model2)    0.863       0.880       0.846  0.726
## 5 Adaboost (ada_model)         0.277       0.413       0.147 -0.437
## 6 Adaboos (ada_model2          0.272       0.388       0.162 -0.447

Data 622 Assignment 2

Angel Gallardo

2025-03-22