ML - Ex4

Eldad Aviv: 206836165

Pre Process

Load Libraries

library(ggplot2)
library(rsample)
library(recipes)
library(caret)
library(yardstick)
library(kernlab)

Load Data

data("Wage", package = "ISLR")

Display Data

head(Wage)
##        year age           maritl     race       education             region
## 231655 2006  18 1. Never Married 1. White    1. < HS Grad 2. Middle Atlantic
## 86582  2004  24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003  45       2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003  43       2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443  2005  50      4. Divorced 1. White      2. HS Grad 2. Middle Atlantic
## 376662 2008  54       2. Married 1. White 4. College Grad 2. Middle Atlantic
##              jobclass         health health_ins  logwage      wage
## 231655  1. Industrial      1. <=Good      2. No 4.318063  75.04315
## 86582  2. Information 2. >=Very Good      2. No 4.255273  70.47602
## 161300  1. Industrial      1. <=Good     1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good     1. Yes 5.041393 154.68529
## 11443  2. Information      1. <=Good     1. Yes 4.318063  75.04315
## 376662 2. Information 2. >=Very Good     1. Yes 4.845098 127.11574
plot(Wage$wage)

It appears as there is a distinct class of high wage in our data. thus, we will try to predict the highest quartile with the following variables:

  1. Age
  2. Race
  3. Education
  4. Job Class

Recode Data

# Calculate quartiles
quartiles <- quantile(Wage$wage, probs = c(0.25, 0.50, 0.75, 1.00))

# Assign  Q1, Q2 & Q3 as low, and Q4 as high
Wage$wage_Q <- ifelse(Wage$wage <= quartiles[3], "low", "high")

Convert Variables to Factors

Wage$race <- as.factor(Wage$race)
Wage$education <- as.factor(Wage$education)
Wage$jobclass <- as.factor(Wage$jobclass)
Wage$wage_Q <- as.factor(Wage$wage_Q)

Split Data

set.seed(1)
splits <- initial_split(Wage, prop = 0.8)
Wage.train <- training(splits)
Wage.test <- testing(splits)

Model Configurations

# Predict wage_Q with Age + Race + Education + Job Class
rec <- recipe(wage_Q ~ age + race + education + jobclass, 
              data = Wage.train) |> 
  step_dummy(all_nominal_predictors()) |> # Convert factors to dummy 
  step_range(all_numeric_predictors()) # Apply min-max normalization

Train Model

Train Control

Use 5-folds Cross Validation

tc <- trainControl(method = "cv", number = 5)

Support Vector Classifier

Tune Cost Parameter

  • In this logarithmic configuration, lower values of C(closer to 0.001) are more densely sampled compared to higher values (closer to 1).
# This grid will test 31 different values of cost parameter C between 0.001 and 1
tg <- expand.grid(
  C = 10 ^ seq(-3, 0, by = 0.1) # [0, 1]
)

tg
##              C
## 1  0.001000000
## 2  0.001258925
## 3  0.001584893
## 4  0.001995262
## 5  0.002511886
## 6  0.003162278
## 7  0.003981072
## 8  0.005011872
## 9  0.006309573
## 10 0.007943282
## 11 0.010000000
## 12 0.012589254
## 13 0.015848932
## 14 0.019952623
## 15 0.025118864
## 16 0.031622777
## 17 0.039810717
## 18 0.050118723
## 19 0.063095734
## 20 0.079432823
## 21 0.100000000
## 22 0.125892541
## 23 0.158489319
## 24 0.199526231
## 25 0.251188643
## 26 0.316227766
## 27 0.398107171
## 28 0.501187234
## 29 0.630957344
## 30 0.794328235
## 31 1.000000000

Training

set.seed(1) # for CV
fit.lin <- train(
  x = rec,
  data = Wage.train,
  method = "svmLinear",
  tuneGrid = tg,
  trControl  = tc
)

Model Parameters

fit.lin 
## Support Vector Machines with Linear Kernel 
## 
## 2400 samples
##   11 predictor
##    2 classes: 'high', 'low' 
## 
## Recipe steps: dummy, range 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1920, 1920, 1920, 1920, 1920 
## Resampling results across tuning parameters:
## 
##   C            Accuracy   Kappa    
##   0.001000000  0.7629167  0.0000000
##   0.001258925  0.7629167  0.0000000
##   0.001584893  0.7629167  0.0000000
##   0.001995262  0.7629167  0.0000000
##   0.002511886  0.7750000  0.1830816
##   0.003162278  0.7900000  0.3245266
##   0.003981072  0.7900000  0.3245266
##   0.005011872  0.7900000  0.3245266
##   0.006309573  0.7900000  0.3245266
##   0.007943282  0.7900000  0.3245266
##   0.010000000  0.7900000  0.3245266
##   0.012589254  0.7900000  0.3245266
##   0.015848932  0.7900000  0.3245266
##   0.019952623  0.7900000  0.3245266
##   0.025118864  0.7900000  0.3245266
##   0.031622777  0.7900000  0.3245266
##   0.039810717  0.7900000  0.3245266
##   0.050118723  0.7900000  0.3245266
##   0.063095734  0.7900000  0.3245266
##   0.079432823  0.7900000  0.3245266
##   0.100000000  0.7900000  0.3245266
##   0.125892541  0.7900000  0.3245266
##   0.158489319  0.7900000  0.3245266
##   0.199526231  0.7900000  0.3245266
##   0.251188643  0.7900000  0.3245266
##   0.316227766  0.7900000  0.3245266
##   0.398107171  0.7900000  0.3245266
##   0.501187234  0.7900000  0.3245266
##   0.630957344  0.7900000  0.3245266
##   0.794328235  0.7900000  0.3245266
##   1.000000000  0.7900000  0.3245266
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.003162278.

Best fit with cost = 0.0032

# Plot CP by accuracy:
plot(fit.lin, xTrans = log)

## Explore the final model ----------------------

fit.lin$finalModel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 0.00316227766016838 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 1149 
## 
## Objective Function Value : -3.4237 
## Training error : 0.21

Test Model

Wage.test$pred_lin <- predict(fit.lin, newdata = Wage.test)

confusionMatrix(Wage.test$pred_lin, Wage.test$wage_Q)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction high low
##       high   60  29
##       low   117 394
##                                           
##                Accuracy : 0.7567          
##                  95% CI : (0.7203, 0.7905)
##     No Information Rate : 0.705           
##     P-Value [Acc > NIR] : 0.002764        
##                                           
##                   Kappa : 0.3161          
##                                           
##  Mcnemar's Test P-Value : 6.014e-13       
##                                           
##             Sensitivity : 0.3390          
##             Specificity : 0.9314          
##          Pos Pred Value : 0.6742          
##          Neg Pred Value : 0.7710          
##              Prevalence : 0.2950          
##          Detection Rate : 0.1000          
##    Detection Prevalence : 0.1483          
##       Balanced Accuracy : 0.6352          
##                                           
##        'Positive' Class : high            
## 

Polynomial SVM

tg <- expand.grid(
  C = 10 ^ seq(-3, 0, by = 0.1), # [0, 1]
  degree = 2, # [1, Inf]
  scale = 1 # [0, Inf]
)

Train Poly

set.seed(1) 
fit.poly2 <- train(
  x = rec,
  data = Wage.train,
  method = "svmPoly",
  tuneGrid = tg,
  trControl  = tc
)

Tuned Poly

fit.poly2 # Best fit with cost = 0.1995262
## Support Vector Machines with Polynomial Kernel 
## 
## 2400 samples
##   11 predictor
##    2 classes: 'high', 'low' 
## 
## Recipe steps: dummy, range 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1920, 1920, 1920, 1920, 1920 
## Resampling results across tuning parameters:
## 
##   C            Accuracy   Kappa    
##   0.001000000  0.7900000  0.3164695
##   0.001258925  0.7900000  0.3164695
##   0.001584893  0.7908333  0.3174200
##   0.001995262  0.7908333  0.3174200
##   0.002511886  0.7908333  0.3174200
##   0.003162278  0.7920833  0.3199458
##   0.003981072  0.7920833  0.3199458
##   0.005011872  0.7920833  0.3199458
##   0.006309573  0.7920833  0.3199458
##   0.007943282  0.7895833  0.3127696
##   0.010000000  0.7891667  0.3119495
##   0.012589254  0.7891667  0.3119495
##   0.015848932  0.7891667  0.3119495
##   0.019952623  0.7891667  0.3119495
##   0.025118864  0.7887500  0.3111353
##   0.031622777  0.7887500  0.3111353
##   0.039810717  0.7887500  0.3111353
##   0.050118723  0.7891667  0.3119495
##   0.063095734  0.7891667  0.3119495
##   0.079432823  0.7887500  0.3111353
##   0.100000000  0.7891667  0.3119495
##   0.125892541  0.7891667  0.3119495
##   0.158489319  0.7887500  0.3111353
##   0.199526231  0.7891667  0.3119495
##   0.251188643  0.7887500  0.3111353
##   0.316227766  0.7887500  0.3111353
##   0.398107171  0.7891667  0.3119495
##   0.501187234  0.7891667  0.3119495
##   0.630957344  0.7891667  0.3119495
##   0.794328235  0.7891667  0.3119495
##   1.000000000  0.7891667  0.3119495
## 
## Tuning parameter 'degree' was held constant at a value of 2
## Tuning
##  parameter 'scale' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 2, scale = 1 and C
##  = 0.003162278.
fit.poly2$bestTune
##   degree scale           C
## 6      2     1 0.003162278
plot(fit.poly2, xTran = log)

Best C = 0.0032

Wage.test$pred_poly2 <- predict(fit.poly2, newdata = Wage.test)

Compare Models

confusionMatrix(Wage.test$pred_lin, Wage.test$wage_Q)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction high low
##       high   60  29
##       low   117 394
##                                           
##                Accuracy : 0.7567          
##                  95% CI : (0.7203, 0.7905)
##     No Information Rate : 0.705           
##     P-Value [Acc > NIR] : 0.002764        
##                                           
##                   Kappa : 0.3161          
##                                           
##  Mcnemar's Test P-Value : 6.014e-13       
##                                           
##             Sensitivity : 0.3390          
##             Specificity : 0.9314          
##          Pos Pred Value : 0.6742          
##          Neg Pred Value : 0.7710          
##              Prevalence : 0.2950          
##          Detection Rate : 0.1000          
##    Detection Prevalence : 0.1483          
##       Balanced Accuracy : 0.6352          
##                                           
##        'Positive' Class : high            
## 
confusionMatrix(Wage.test$pred_poly2, Wage.test$wage_Q)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction high low
##       high   57  29
##       low   120 394
##                                           
##                Accuracy : 0.7517          
##                  95% CI : (0.7151, 0.7858)
##     No Information Rate : 0.705           
##     P-Value [Acc > NIR] : 0.006279        
##                                           
##                   Kappa : 0.298           
##                                           
##  Mcnemar's Test P-Value : 1.667e-13       
##                                           
##             Sensitivity : 0.3220          
##             Specificity : 0.9314          
##          Pos Pred Value : 0.6628          
##          Neg Pred Value : 0.7665          
##              Prevalence : 0.2950          
##          Detection Rate : 0.0950          
##    Detection Prevalence : 0.1433          
##       Balanced Accuracy : 0.6267          
##                                           
##        'Positive' Class : high            
## 

Compare Models Report

The Linear Model demonstrates slightly better performance in accuracy, Kappa, sensitivity, positive predictive value, and balanced accuracy, making it a marginally better choice overall compared to the Polynomial Model. Both models show the same effectiveness in correctly identifying actual low wage subjects, as indicated by their identical specificity.