library(ISLR2)
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.0 ──
## ✔ broom 1.0.4 ✔ rsample 1.1.1
## ✔ dials 1.2.0 ✔ tibble 3.2.1
## ✔ ggplot2 3.4.2 ✔ tidyr 1.3.0
## ✔ infer 1.0.4 ✔ tune 1.1.1
## ✔ modeldata 1.1.0 ✔ workflows 1.1.3
## ✔ parsnip 1.1.0 ✔ workflowsets 1.0.1
## ✔ purrr 1.0.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.6
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:purrr':
##
## lift
data <- Default
data <- data %>% mutate(default=ifelse(default=="Yes",1,0)) %>% mutate(student=ifelse(student=="Yes",1,0))
set.seed(123)
split <- initial_split(data, pop=0.8, strat= default)
train <- training(split)
test <- testing(split)
#Assessing model accuracy
cv_model1 <- glm(default ~ balance,family="binomial", data = train)
tidy(cv_model1)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -10.8 0.423 -25.4 8.90e-143
## 2 balance 0.00554 0.000256 21.6 1.22e-103
cv_model2 <- glm(default ~ student, family="binomial", data = train)
tidy(cv_model2)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -3.53 0.0826 -42.8 0
## 2 student 0.429 0.134 3.20 0.00137
cv_model3 <- glm(default ~., family="binomial",data = train)
tidy(cv_model3)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -10.9 0.575 -18.9 7.51e- 80
## 2 student -0.700 0.275 -2.54 1.10e- 2
## 3 balance 0.00578 0.000270 21.4 1.41e-101
## 4 income 0.000000568 0.00000961 0.0591 9.53e- 1
#Model accuracy (out of sample performance measures)
train$default <- factor(train$default, levels = c("0", "1"))
test$default <- factor(test$default, levels = c("0", "1"))
pred_default <- predict(cv_model3,test, type="response")
pred_result <- test %>% select(default) %>% bind_cols(pred_default)
## New names:
## • `` -> `...2`
colnames(pred_result) <- c("actual","prediction")
pred_default <- as.factor(pred_default)
str(pred_default)
## Factor w/ 2500 levels "9.43883611530646e-06",..: 1252 660 1740 947 2254 1490 2101 1041 463 962 ...
## - attr(*, "names")= chr [1:2500] "6" "19" "23" "24" ...
#Model accuracy (confusion matrix) Model accuracy (ROC curve)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
actual<- as.numeric(pred_result$actual)
prediction <- as.numeric(pred_result$prediction)
roc_data <- roc(actual, prediction)
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
auc_value <- auc(roc_data)
plot(roc_data, print.auc = TRUE, auc.polygon = TRUE,
legacy.axes = TRUE, grid = TRUE, main = paste0("ROC Curve (AUC = ", round(auc_value, 3), ")"))
