למידת מכונה תרגיל 2
מגיש: אלדד אביב 206836165
Classification and Tuning
# Load ISLR to get Smarket dataset
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
# Load tidyverse for data manipulations
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load rsample to split data
library(rsample)
## Warning: package 'rsample' was built under R version 4.3.3
# Load recipes for model configuration
library(recipes)
## Warning: package 'recipes' was built under R version 4.3.3
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
# Load caret for model fitting and prediction
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# Load yardstick to evaluate model performance
library(yardstick)
## Warning: package 'yardstick' was built under R version 4.3.3
##
## Attaching package: 'yardstick'
##
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:readr':
##
## spec
# Attach the Smarket data frame that ships with the ISLR package
data("Smarket", package = "ISLR")
contrasts(Data$Variable) - Display encoding of Variable
from Data
# Dummy coding of the response: shows which level of Direction is coded 1
Smarket$Direction |> contrasts()
## Up
## Down 0
## Up 1
table(Data$Variable) - Create a contingency table of
Variable from Data
proportions(table(Data$Variable)) - Display the proportions
of Variable's values from Data
# Number of observations in each Direction class
Smarket$Direction |> table()
##
## Down Up
## 602 648
# Share of observations in each Direction class
Smarket$Direction |>
  table() |>
  proportions()
##
## Down Up
## 0.4816 0.5184
set.seed(1234) - make random split process reproducible,
running the same split code multiple times will yield the same
results.
initial_split() - From rsample library,
split data into training and test sets. (arguments: df, training set
proportion).
# Fix the RNG state so the random partition below is reproducible
set.seed(1234)
# prop = 0.7: 70% of the rows go to the training set, the rest to the test set
splits <- Smarket |>
  initial_split(prop = 0.7)
Smarket.train <- splits |> training()
Smarket.test <- splits |> testing()
Model configuration: Predict Direction using Lag1 + Lag2 + Lag3
# Model specification: Direction is the outcome and the first three lags
# are the predictors; the recipe is declared on the training set
rec <- recipe(
  Direction ~ Lag1 + Lag2 + Lag3,
  data = Smarket.train
)
Train Control:
# Resampling scheme handed to train(): 10-fold cross-validation.
# selectionFunction = "best" picks the candidate model with the best
# resampled performance.
tc <- trainControl(
  method = "cv",
  number = 10,
  selectionFunction = "best"
)
Train model:
# Logistic regression (binomial GLM with logit link) fitted on the
# training data and resampled according to `tc`
LogRegfit <- train(
  rec,
  data = Smarket.train,
  method = "glm",
  family = binomial(link = "logit"),
  trControl = tc
)
predict(trained_model, newdata = test_set)
type = "prob" - Probabilities of the observations
belonging to each class
type = "raw" - Class prediction based on a 0.5 probability cutoff
# Class-membership probabilities for the test observations, one column per
# level of Direction; e.g. the Up column is P(Direction = Up | predictors).
# Only meaningful for classification models.
predicted.probs <- predict(
  LogRegfit,
  newdata = Smarket.test,
  type = "prob"
)
head(predicted.probs) # first rows of the per-class probabilities
## Down Up
## 1 0.5113228 0.4886772
## 2 0.5004720 0.4995280
## 3 0.5311309 0.4688691
## 4 0.4897712 0.5102288
## 5 0.4825237 0.5174763
## 6 0.5077210 0.4922790
# Hard class labels for the test observations
predicted.classes <- predict(
  LogRegfit,
  newdata = Smarket.test,
  type = "raw"
)
head(predicted.classes) # first predicted labels
## [1] Down Down Down Up Up Down
## Levels: Down Up
# Cross-tabulate predicted vs. true test labels; "Up" is the positive class
# for sensitivity, specificity and the related statistics
conf_mat <- confusionMatrix(
  data = predicted.classes,
  reference = Smarket.test$Direction,
  positive = "Up"
)
conf_mat
## Confusion Matrix and Statistics
##
## Reference
## Prediction Down Up
## Down 45 49
## Up 137 144
##
## Accuracy : 0.504
## 95% CI : (0.4522, 0.5557)
## No Information Rate : 0.5147
## P-Value [Acc > NIR] : 0.6791
##
## Kappa : -0.0067
##
## Mcnemar's Test P-Value : 1.781e-10
##
## Sensitivity : 0.7461
## Specificity : 0.2473
## Pos Pred Value : 0.5125
## Neg Pred Value : 0.4787
## Prevalence : 0.5147
## Detection Rate : 0.3840
## Detection Prevalence : 0.7493
## Balanced Accuracy : 0.4967
##
## 'Positive' Class : Up
##
# Cross-validation error: 1 - mean accuracy over the resamples stored in the fit
cv_accuracy <- mean(LogRegfit[["resample"]][["Accuracy"]])
cv_error <- 1 - cv_accuracy
cv_error
## [1] 0.4800679
# Test error: 1 - accuracy reported by the confusion matrix.
# The result keeps the element name "Accuracy" even though it is an error rate.
test_accuracy <- conf_mat[["overall"]]["Accuracy"]
test_error <- 1 - test_accuracy
test_error
## Accuracy
## 0.496
# Attach the Caravan data frame from ISLR and count the Purchase classes
data(Caravan, package = "ISLR")
Caravan$Purchase |> table()
##
## No Yes
## 5474 348
# Share of each Purchase class
Caravan$Purchase |>
  table() |>
  proportions()
##
## No Yes
## 0.94022673 0.05977327
# Reproducible random partition of Caravan: 70% training, remainder test
set.seed(9)
splits <- Caravan |>
  initial_split(prop = 0.7)
Caravan.train <- splits |> training()
Caravan.test <- splits |> testing()
# Recipe for KNN: three predictors, rescaled with step_range() (min-max
# scaling) so they are on a comparable scale
rec.knn <- step_range(
  recipe(Purchase ~ MOSTYPE + MOSHOOFD + MOPLLAAG, data = Caravan.train),
  all_numeric_predictors()
)
# 10-fold cross-validation; keep the candidate with the best resampled metric
tc <- trainControl(
  method = "cv",
  number = 10,
  selectionFunction = "best"
)
# Candidate numbers of neighbours to compare
tg <- expand.grid(k = c(1, 5, 20))
# The recipe is deliberately NOT prepped/baked by hand here.
# train() takes the untrained recipe together with the raw training data
# and prepares the recipe itself, and predict() on the fitted object applies
# it to `newdata`. Prepping up front and overwriting Caravan.train /
# Caravan.test with baked copies is redundant (the data would be scaled and
# then run through the recipe again) and discards the raw data frames.
set.seed(10) # reproducible fold assignment for the cross-validation
knn.fit.10CV <- train(
  x = rec.knn,
  data = Caravan.train,
  method = "knn",
  tuneGrid = tg,
  trControl = tc,
  metric = "Accuracy"
)
plot(knn.fit.10CV) # Resampled accuracy for each candidate k
knn.fit.10CV$bestTune # k selected by cross-validation
## k
## 3 20
# Class predictions on the test set and the matching confusion matrix,
# with "Yes" as the positive class
predicted.classes.10CV <- predict(
  knn.fit.10CV,
  newdata = Caravan.test,
  type = "raw"
)
conf_mat <- confusionMatrix(
  data = predicted.classes.10CV,
  reference = Caravan.test$Purchase,
  positive = "Yes"
)
# Cross-validation error: 1 - mean accuracy over the stored resamples
cv_accuracy <- mean(knn.fit.10CV[["resample"]][["Accuracy"]])
cv_error <- 1 - cv_accuracy
cv_error
## [1] 0.0549682
# Test error: 1 - accuracy from the confusion matrix
# (the printed value keeps the element name "Accuracy")
test_accuracy <- conf_mat[["overall"]]["Accuracy"]
test_error <- 1 - test_accuracy
test_error
## Accuracy
## 0.07097882
# Same KNN tuning, resampled with 50 bootstrap samples instead of 10-fold CV.
# NOTE(review): no seed is set right before this fit, so the resamples
# depend on the RNG state left by the preceding code.
tc <- trainControl(
  method = "boot",
  number = 50
)
knn.fit.50boot <- train(
  rec.knn,
  data = Caravan.train,
  method = "knn",
  tuneGrid = tg,
  trControl = tc,
  metric = "Accuracy"
)
plot(knn.fit.50boot) # Resampled accuracy for each candidate k
knn.fit.50boot[["bestTune"]] # k selected by the bootstrap estimate
## k
## 3 20
# Class predictions on the test set and the matching confusion matrix,
# with "Yes" as the positive class
predicted.classes.50boot <- predict(
  knn.fit.50boot,
  newdata = Caravan.test,
  type = "raw"
)
conf_mat <- confusionMatrix(
  data = predicted.classes.50boot,
  reference = Caravan.test$Purchase,
  positive = "Yes"
)
# Resampling (bootstrap) error: 1 - mean accuracy over the stored resamples
cv_accuracy <- mean(knn.fit.50boot[["resample"]][["Accuracy"]])
cv_error <- 1 - cv_accuracy
cv_error
## [1] 0.05571996
# Test error: 1 - accuracy from the confusion matrix
# (the printed value keeps the element name "Accuracy")
test_accuracy <- conf_mat[["overall"]]["Accuracy"]
test_error <- 1 - test_accuracy
test_error
## Accuracy
## 0.07097882
# Same KNN tuning, resampled with 10-fold CV repeated 10 times.
# NOTE(review): no seed is set right before this fit, so the resamples
# depend on the RNG state left by the preceding code.
tc <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10
)
knn.fit.10CVr <- train(
  rec.knn,
  data = Caravan.train,
  method = "knn",
  tuneGrid = tg,
  trControl = tc,
  metric = "Accuracy"
)
plot(knn.fit.10CVr) # Resampled accuracy for each candidate k
knn.fit.10CVr[["bestTune"]] # k selected by repeated cross-validation
## k
## 3 20
# Class predictions on the test set and the matching confusion matrix,
# with "Yes" as the positive class
predicted.classes.10CVr <- predict(
  knn.fit.10CVr,
  newdata = Caravan.test,
  type = "raw"
)
conf_mat <- confusionMatrix(
  data = predicted.classes.10CVr,
  reference = Caravan.test$Purchase,
  positive = "Yes"
)
# Repeated-CV error: 1 - mean accuracy over the stored resamples
cv_accuracy <- mean(knn.fit.10CVr[["resample"]][["Accuracy"]])
cv_error <- 1 - cv_accuracy
cv_error
## [1] 0.05496799
# Test error: 1 - accuracy from the confusion matrix
# (the printed value keeps the element name "Accuracy")
test_accuracy <- conf_mat[["overall"]]["Accuracy"]
test_error <- 1 - test_accuracy
test_error
## Accuracy
## 0.07097882
ההיפרפרמטר K = 20 הניב את הביצועים הטובים ביותר בשלושת המודלים, ביחס שונה בין כל מודל.
חלוקה לעשרה פולדים הניבה טעות CV נמוכה יותר.