למידת מכונה תרגיל 2

מגיש: אלדד אביב 206836165

Classification and Tuning

0.0.1 Load Libraries:

# Load ISLR to get Smarket dataset
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
# Load tidyverse for data manipulations
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load rsample to split data
library(rsample)
## Warning: package 'rsample' was built under R version 4.3.3
# Load recipes for model configuration
library(recipes)
## Warning: package 'recipes' was built under R version 4.3.3
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step
# Load caret for model fitting and prediction
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# Load yardstick to evaluate model performance
library(yardstick)
## Warning: package 'yardstick' was built under R version 4.3.3
## 
## Attaching package: 'yardstick'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall, sensitivity, specificity
## 
## The following object is masked from 'package:readr':
## 
##     spec

0.0.2 Load data:

# Load the Smarket data frame from the ISLR package into the workspace
data(Smarket, package = "ISLR")

contrasts(Data$Variable) - Display encoding of Variable from Data

# Show how the Direction factor is dummy-coded (which level maps to 0 and which to 1)
contrasts(Smarket$Direction)
##      Up
## Down  0
## Up    1

table(Data$Variable) - Create a contingency table of Variable from Data

proportions(table(Data$Variable)) - Display the proportion of each value of Variable in Data

# Number of observations in each Direction class
table(Smarket$Direction)
## 
## Down   Up 
##  602  648
# The same counts expressed as shares of all observations
proportions(table(Smarket$Direction))
## 
##   Down     Up 
## 0.4816 0.5184

1 Logistic Regression for stock direction classification

1.0.1 Data Splitting

set.seed(1234) - makes the random split reproducible: running the same split code multiple times yields the same result.

initial_split() - From rsample library, split data into training and test sets. (arguments: df, training set proportion).

set.seed(1234) # Fix the RNG state so the random split below is reproducible
splits <- initial_split(Smarket, prop = 0.7) # 70% of the rows go to training
Smarket.train <- training(splits) # Training portion of the split
Smarket.test  <- testing(splits)  # Held-out test portion of the split

1.0.2 Fitting logistic regression on train data using caret

Model configuration: Predict Direction using Lag1 + Lag2 + Lag3

# Recipe declaring the model terms: Direction is the outcome and
# Lag1, Lag2, Lag3 are the predictors; no preprocessing steps are added.
rec <- recipe(Direction ~ Lag1 + Lag2 + Lag3, 
              data = Smarket.train)

Train Control:

tc <- trainControl(method = "cv", number = 10, # 10-fold cross-validation
                   selectionFunction = "best") # Keep the tuning value with the best resampled performance

Train model:

# Seed the RNG immediately before train() so the random assignment of rows to
# the 10 CV folds -- and therefore the resampled accuracy -- is the same on
# every run.
set.seed(1234)
LogRegfit <- train(
  x = rec,
  data = Smarket.train, # Fit on the training data only
  method = "glm", family = binomial("logit"), # Logistic regression
  trControl = tc
)

1.0.3 Prediction for test set:

predict(trained_model, newdata = test_set)

type = "prob" - Probabilities of the observations belonging to each class

type = "raw" - Class prediction based on a 0.5 cutoff

# type = "prob" returns one probability column per class (Down, Up).
# The Up column is p(Y = 1|X): the probability that the direction is Up
# given the observation's predictor values.

predicted.probs <- predict(LogRegfit, newdata = Smarket.test, type = "prob")
head(predicted.probs) # First rows of the per-class probabilities
##        Down        Up
## 1 0.5113228 0.4886772
## 2 0.5004720 0.4995280
## 3 0.5311309 0.4688691
## 4 0.4897712 0.5102288
## 5 0.4825237 0.5174763
## 6 0.5077210 0.4922790
# type = "raw" returns the predicted class label for each test observation
predicted.classes <- predict(LogRegfit, newdata = Smarket.test, type = "raw")
head(predicted.classes) # First predicted class labels
## [1] Down Down Down Up   Up   Down
## Levels: Down Up

1.0.4 Model Evaluation:

# Cross-tabulate predicted vs. true test labels; class-specific statistics
# (sensitivity, specificity, ...) are reported with "Up" as the positive class.
conf_mat <- confusionMatrix(predicted.classes, # Predicted classes
                Smarket.test$Direction, # True classes
                positive = "Up")
conf_mat
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Down  Up
##       Down   45  49
##       Up    137 144
##                                           
##                Accuracy : 0.504           
##                  95% CI : (0.4522, 0.5557)
##     No Information Rate : 0.5147          
##     P-Value [Acc > NIR] : 0.6791          
##                                           
##                   Kappa : -0.0067         
##                                           
##  Mcnemar's Test P-Value : 1.781e-10       
##                                           
##             Sensitivity : 0.7461          
##             Specificity : 0.2473          
##          Pos Pred Value : 0.5125          
##          Neg Pred Value : 0.4787          
##              Prevalence : 0.5147          
##          Detection Rate : 0.3840          
##    Detection Prevalence : 0.7493          
##       Balanced Accuracy : 0.4967          
##                                           
##        'Positive' Class : Up              
## 
# Cross-validated error: 1 - mean accuracy over the held-out CV folds
cv_accuracy <- mean(LogRegfit$resample$Accuracy)
cv_error <- 1 - cv_accuracy
cv_error
## [1] 0.4800679
# Test error: 1 - accuracy on the held-out test set.
# `[[` extracts the bare number; `[` would keep the element name, and the
# error rate would then be printed under a misleading "Accuracy" label.
test_accuracy <- conf_mat$overall[["Accuracy"]]
test_error <- 1 - test_accuracy
test_error
## [1] 0.496

2 KNN: Tuning k with 10-folds CV

2.1 1. Fit KNN with k=1, 5 and 20 using 10-folds CV and assess performance on test data. what were the chosen tuning parameter, cv error and test error?

2.1.1 Load data

# Load the Caravan data frame from the ISLR package into the workspace
data("Caravan", package = "ISLR")
# Number of observations in each Purchase class
table(Caravan$Purchase)
## 
##   No  Yes 
## 5474  348
# The same counts expressed as shares of all observations
proportions(table(Caravan$Purchase))
## 
##         No        Yes 
## 0.94022673 0.05977327

2.1.2 Splitting Data

set.seed(9) # Fix the RNG state so the random split below is reproducible
splits <- initial_split(Caravan, prop = 0.7) # 70% of the rows go to training
Caravan.train <- training(splits) # Training portion of the split
Caravan.test  <- testing(splits)  # Held-out test portion of the split

2.1.3 Model Configurations

# Recipe: Purchase is the outcome; the three predictors are min-max scaled.
# The recipe is deliberately left unprepared and the data left un-baked:
# train() expects an unprepared recipe, estimates the scaling ranges itself
# inside every resample, and re-applies the fitted recipe when predict() is
# given new data. Prepping/baking by hand first would compute the ranges from
# the full training set (leaking held-out folds into the resampled estimate)
# and would push already-scaled data through the scaling step a second time.
rec.knn <- recipe(Purchase ~ MOSTYPE + MOSHOOFD + MOPLLAAG,
                  data = Caravan.train) |>
  step_range(all_numeric_predictors()) # Min-max scaling
# Resampling scheme: 10-fold CV, keeping the k with the best resampled metric
tc <- trainControl(method = "cv", number = 10,
                   selectionFunction = "best")
# Candidate values of the tuning parameter k
tg <- expand.grid(
  k = c(1, 5, 20) # k options
)

2.1.4 Train Model

set.seed(10) # Fix the RNG state so the CV fold assignment is reproducible
# Fit KNN for every k in the tuning grid and select k by resampled accuracy
knn.fit.10CV <- train(
  x = rec.knn,          # Model terms and preprocessing
  data = Caravan.train, # Training data
  method = "knn",
  tuneGrid = tg,        # Candidate k values
  trControl = tc,       # Resampling scheme
  metric = "Accuracy"   # Metric used to pick the best k
)

2.1.5 10CV Model Evaluation

plot(knn.fit.10CV) # Resampled accuracy for each candidate k

knn.fit.10CV$bestTune # k selected by cross-validation
##    k
## 3 20
# Predict test-set classes with the selected model and tabulate them against
# the true labels, treating "Yes" as the positive class.
predicted.classes.10CV <- predict(knn.fit.10CV, newdata = Caravan.test, type = "raw")
conf_mat <- confusionMatrix(predicted.classes.10CV, Caravan.test$Purchase, positive = "Yes")
# CV error of the selected k: 1 - mean held-out accuracy.
# With trainControl's default returnResamp = "final", $resample holds only the
# resamples of the selected k.
cv_accuracy <- mean(knn.fit.10CV$resample$Accuracy)
cv_error <- 1 - cv_accuracy
cv_error
## [1] 0.0549682
# Test error: 1 - accuracy on the held-out test set.
# `[[` extracts the bare number; `[` would keep the element name, and the
# error rate would then be printed under a misleading "Accuracy" label.
test_accuracy <- conf_mat$overall[["Accuracy"]]
test_error <- 1 - test_accuracy
test_error
## [1] 0.07097882

2.2 2. Repeat (1) using either 50 bootstrap samples

tc <- trainControl(method = "boot", number = 50) # 50 bootstrap resamples

2.2.1 Train Model

# Seed the RNG immediately before train() so the bootstrap resamples -- and
# therefore the resampled accuracy and the selected k -- are the same on
# every run.
set.seed(10)
# Fit KNN for every k in the tuning grid and select k by resampled accuracy
knn.fit.50boot <- train(
  x = rec.knn,
  data = Caravan.train,
  method = "knn",
  tuneGrid = tg,
  trControl = tc,
  metric = "Accuracy"
)

2.2.2 50boot Model Evaluation

plot(knn.fit.50boot) # Resampled accuracy for each candidate k

knn.fit.50boot$bestTune # k selected by the bootstrap resampling
##    k
## 3 20
# Predict test-set classes with the selected model and tabulate them against
# the true labels, treating "Yes" as the positive class.
predicted.classes.50boot <- predict(knn.fit.50boot, newdata = Caravan.test, type = "raw")
conf_mat <- confusionMatrix(predicted.classes.50boot, Caravan.test$Purchase, positive = "Yes")
# Resampling (bootstrap) error of the selected k: 1 - mean resampled accuracy.
# The variable names keep the cv_ prefix for consistency with the other
# sections. With trainControl's default returnResamp = "final", $resample
# holds only the resamples of the selected k.
cv_accuracy <- mean(knn.fit.50boot$resample$Accuracy)
cv_error <- 1 - cv_accuracy
cv_error
## [1] 0.05571996
# Test error: 1 - accuracy on the held-out test set.
# `[[` extracts the bare number; `[` would keep the element name, and the
# error rate would then be printed under a misleading "Accuracy" label.
test_accuracy <- conf_mat$overall[["Accuracy"]]
test_error <- 1 - test_accuracy
test_error
## [1] 0.07097882

2.3 repeated 10-fold CV.

2.3.1 Train Model

# Resampling scheme: 10-fold cross-validation repeated 10 times
tc <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
# Seed the RNG immediately before train() so the fold assignments -- and
# therefore the resampled accuracy and the selected k -- are the same on
# every run.
set.seed(10)
# Fit KNN for every k in the tuning grid and select k by resampled accuracy
knn.fit.10CVr <- train(
  x = rec.knn,
  data = Caravan.train,
  method = "knn",
  tuneGrid = tg,
  trControl = tc,
  metric = "Accuracy"
)

2.3.2 10CVrepeated Model Evaluation

plot(knn.fit.10CVr) # Resampled accuracy for each candidate k

knn.fit.10CVr$bestTune # k selected by repeated cross-validation
##    k
## 3 20
# Predict test-set classes with the selected model and tabulate them against
# the true labels, treating "Yes" as the positive class.
predicted.classes.10CVr <- predict(knn.fit.10CVr, newdata = Caravan.test, type = "raw")
conf_mat <- confusionMatrix(predicted.classes.10CVr, Caravan.test$Purchase, positive = "Yes")
# Repeated-CV error of the selected k: 1 - mean held-out accuracy.
# With trainControl's default returnResamp = "final", $resample holds only the
# resamples of the selected k.
cv_accuracy <- mean(knn.fit.10CVr$resample$Accuracy)
cv_error <- 1 - cv_accuracy
cv_error
## [1] 0.05496799
# Test error: 1 - accuracy on the held-out test set.
# `[[` extracts the bare number; `[` would keep the element name, and the
# error rate would then be printed under a misleading "Accuracy" label.
test_accuracy <- conf_mat$overall[["Accuracy"]]
test_error <- 1 - test_accuracy
test_error
## [1] 0.07097882

2.4 3. How did the resampling methods (1 vs 2) differ in their results?

ההיפר-פרמטר K = 20 הניב את הביצועים הטובים ביותר בשלושת המודלים ביחס שונה בין כל מודל

חלוקה לעשרה פולדים הניבה טעות CV נמוכה יותר