ML - Ex4
Eldad Aviv: 206836165
library(ggplot2)
library(rsample)
library(recipes)
library(caret)
library(yardstick)
library(kernlab)
data("Wage", package = "ISLR")
head(Wage)
## year age maritl race education region
## 231655 2006 18 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic
## 86582 2004 24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003 45 2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003 43 2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443 2005 50 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic
## 376662 2008 54 2. Married 1. White 4. College Grad 2. Middle Atlantic
## jobclass health health_ins logwage wage
## 231655 1. Industrial 1. <=Good 2. No 4.318063 75.04315
## 86582 2. Information 2. >=Very Good 2. No 4.255273 70.47602
## 161300 1. Industrial 1. <=Good 1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good 1. Yes 5.041393 154.68529
## 11443 2. Information 1. <=Good 1. Yes 4.318063 75.04315
## 376662 2. Information 2. >=Very Good 1. Yes 4.845098 127.11574
plot(Wage$wage)
It appears as there is a distinct class of high wage in our data. thus, we will try to predict the highest quartile with the following variables:
# Calculate quartiles
quartiles <- quantile(Wage$wage, probs = c(0.25, 0.50, 0.75, 1.00))
# Assign Q1, Q2 & Q3 as low, and Q4 as high
Wage$wage_Q <- ifelse(Wage$wage <= quartiles[3], "low", "high")
Convert Variables to Factors
Wage$race <- as.factor(Wage$race)
Wage$education <- as.factor(Wage$education)
Wage$jobclass <- as.factor(Wage$jobclass)
Wage$wage_Q <- as.factor(Wage$wage_Q)
set.seed(1)
splits <- initial_split(Wage, prop = 0.8)
Wage.train <- training(splits)
Wage.test <- testing(splits)
# Predict wage_Q with Age + Race + Education + Job Class
rec <- recipe(wage_Q ~ age + race + education + jobclass,
data = Wage.train) |>
step_dummy(all_nominal_predictors()) |> # Convert factors to dummy
step_range(all_numeric_predictors()) # Apply min-max normalization
Use 5-folds Cross Validation
tc <- trainControl(method = "cv", number = 5)
Tune Cost Parameter
# This grid will test 31 different values of cost parameter C between 0.001 and 1
tg <- expand.grid(
C = 10 ^ seq(-3, 0, by = 0.1) # [0, 1]
)
tg
## C
## 1 0.001000000
## 2 0.001258925
## 3 0.001584893
## 4 0.001995262
## 5 0.002511886
## 6 0.003162278
## 7 0.003981072
## 8 0.005011872
## 9 0.006309573
## 10 0.007943282
## 11 0.010000000
## 12 0.012589254
## 13 0.015848932
## 14 0.019952623
## 15 0.025118864
## 16 0.031622777
## 17 0.039810717
## 18 0.050118723
## 19 0.063095734
## 20 0.079432823
## 21 0.100000000
## 22 0.125892541
## 23 0.158489319
## 24 0.199526231
## 25 0.251188643
## 26 0.316227766
## 27 0.398107171
## 28 0.501187234
## 29 0.630957344
## 30 0.794328235
## 31 1.000000000
set.seed(1) # for CV
fit.lin <- train(
x = rec,
data = Wage.train,
method = "svmLinear",
tuneGrid = tg,
trControl = tc
)
fit.lin
## Support Vector Machines with Linear Kernel
##
## 2400 samples
## 11 predictor
## 2 classes: 'high', 'low'
##
## Recipe steps: dummy, range
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1920, 1920, 1920, 1920, 1920
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.001000000 0.7629167 0.0000000
## 0.001258925 0.7629167 0.0000000
## 0.001584893 0.7629167 0.0000000
## 0.001995262 0.7629167 0.0000000
## 0.002511886 0.7750000 0.1830816
## 0.003162278 0.7900000 0.3245266
## 0.003981072 0.7900000 0.3245266
## 0.005011872 0.7900000 0.3245266
## 0.006309573 0.7900000 0.3245266
## 0.007943282 0.7900000 0.3245266
## 0.010000000 0.7900000 0.3245266
## 0.012589254 0.7900000 0.3245266
## 0.015848932 0.7900000 0.3245266
## 0.019952623 0.7900000 0.3245266
## 0.025118864 0.7900000 0.3245266
## 0.031622777 0.7900000 0.3245266
## 0.039810717 0.7900000 0.3245266
## 0.050118723 0.7900000 0.3245266
## 0.063095734 0.7900000 0.3245266
## 0.079432823 0.7900000 0.3245266
## 0.100000000 0.7900000 0.3245266
## 0.125892541 0.7900000 0.3245266
## 0.158489319 0.7900000 0.3245266
## 0.199526231 0.7900000 0.3245266
## 0.251188643 0.7900000 0.3245266
## 0.316227766 0.7900000 0.3245266
## 0.398107171 0.7900000 0.3245266
## 0.501187234 0.7900000 0.3245266
## 0.630957344 0.7900000 0.3245266
## 0.794328235 0.7900000 0.3245266
## 1.000000000 0.7900000 0.3245266
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 0.003162278.
Best fit with cost = 0.0032
# Plot CP by accuracy:
plot(fit.lin, xTrans = log)
## Explore the final model ----------------------
fit.lin$finalModel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 0.00316227766016838
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 1149
##
## Objective Function Value : -3.4237
## Training error : 0.21
Wage.test$pred_lin <- predict(fit.lin, newdata = Wage.test)
confusionMatrix(Wage.test$pred_lin, Wage.test$wage_Q)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low
## high 60 29
## low 117 394
##
## Accuracy : 0.7567
## 95% CI : (0.7203, 0.7905)
## No Information Rate : 0.705
## P-Value [Acc > NIR] : 0.002764
##
## Kappa : 0.3161
##
## Mcnemar's Test P-Value : 6.014e-13
##
## Sensitivity : 0.3390
## Specificity : 0.9314
## Pos Pred Value : 0.6742
## Neg Pred Value : 0.7710
## Prevalence : 0.2950
## Detection Rate : 0.1000
## Detection Prevalence : 0.1483
## Balanced Accuracy : 0.6352
##
## 'Positive' Class : high
##
tg <- expand.grid(
C = 10 ^ seq(-3, 0, by = 0.1), # [0, 1]
degree = 2, # [1, Inf]
scale = 1 # [0, Inf]
)
set.seed(1)
fit.poly2 <- train(
x = rec,
data = Wage.train,
method = "svmPoly",
tuneGrid = tg,
trControl = tc
)
fit.poly2 # Best fit with cost = 0.1995262
## Support Vector Machines with Polynomial Kernel
##
## 2400 samples
## 11 predictor
## 2 classes: 'high', 'low'
##
## Recipe steps: dummy, range
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1920, 1920, 1920, 1920, 1920
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.001000000 0.7900000 0.3164695
## 0.001258925 0.7900000 0.3164695
## 0.001584893 0.7908333 0.3174200
## 0.001995262 0.7908333 0.3174200
## 0.002511886 0.7908333 0.3174200
## 0.003162278 0.7920833 0.3199458
## 0.003981072 0.7920833 0.3199458
## 0.005011872 0.7920833 0.3199458
## 0.006309573 0.7920833 0.3199458
## 0.007943282 0.7895833 0.3127696
## 0.010000000 0.7891667 0.3119495
## 0.012589254 0.7891667 0.3119495
## 0.015848932 0.7891667 0.3119495
## 0.019952623 0.7891667 0.3119495
## 0.025118864 0.7887500 0.3111353
## 0.031622777 0.7887500 0.3111353
## 0.039810717 0.7887500 0.3111353
## 0.050118723 0.7891667 0.3119495
## 0.063095734 0.7891667 0.3119495
## 0.079432823 0.7887500 0.3111353
## 0.100000000 0.7891667 0.3119495
## 0.125892541 0.7891667 0.3119495
## 0.158489319 0.7887500 0.3111353
## 0.199526231 0.7891667 0.3119495
## 0.251188643 0.7887500 0.3111353
## 0.316227766 0.7887500 0.3111353
## 0.398107171 0.7891667 0.3119495
## 0.501187234 0.7891667 0.3119495
## 0.630957344 0.7891667 0.3119495
## 0.794328235 0.7891667 0.3119495
## 1.000000000 0.7891667 0.3119495
##
## Tuning parameter 'degree' was held constant at a value of 2
## Tuning
## parameter 'scale' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were degree = 2, scale = 1 and C
## = 0.003162278.
fit.poly2$bestTune
## degree scale C
## 6 2 1 0.003162278
plot(fit.poly2, xTran = log)
Best C = 0.0032
Wage.test$pred_poly2 <- predict(fit.poly2, newdata = Wage.test)
confusionMatrix(Wage.test$pred_lin, Wage.test$wage_Q)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low
## high 60 29
## low 117 394
##
## Accuracy : 0.7567
## 95% CI : (0.7203, 0.7905)
## No Information Rate : 0.705
## P-Value [Acc > NIR] : 0.002764
##
## Kappa : 0.3161
##
## Mcnemar's Test P-Value : 6.014e-13
##
## Sensitivity : 0.3390
## Specificity : 0.9314
## Pos Pred Value : 0.6742
## Neg Pred Value : 0.7710
## Prevalence : 0.2950
## Detection Rate : 0.1000
## Detection Prevalence : 0.1483
## Balanced Accuracy : 0.6352
##
## 'Positive' Class : high
##
confusionMatrix(Wage.test$pred_poly2, Wage.test$wage_Q)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low
## high 57 29
## low 120 394
##
## Accuracy : 0.7517
## 95% CI : (0.7151, 0.7858)
## No Information Rate : 0.705
## P-Value [Acc > NIR] : 0.006279
##
## Kappa : 0.298
##
## Mcnemar's Test P-Value : 1.667e-13
##
## Sensitivity : 0.3220
## Specificity : 0.9314
## Pos Pred Value : 0.6628
## Neg Pred Value : 0.7665
## Prevalence : 0.2950
## Detection Rate : 0.0950
## Detection Prevalence : 0.1433
## Balanced Accuracy : 0.6267
##
## 'Positive' Class : high
##
Compare Models Report
The Linear Model demonstrates slightly better performance in accuracy, Kappa, sensitivity, positive predictive value, and balanced accuracy, making it a marginally better choice overall compared to the Polynomial Model. Both models show the same effectiveness in correctly identifying actual low wage subjects, as indicated by their identical specificity.