library(ISLR2)
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.1.8
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(class)
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:lattice':
##
## melanoma
data("Default")
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
attach(Default)
default.lm <- glm(default ~ income + balance , data = Default,family = "binomial")
summary(default.lm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
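The coefficients above can be read on the probability scale. As a quick illustration (the income and balance values below are made up for this write-up, not taken from the exercise):
# Implied default probability for a hypothetical borrower with income = 40000
# and balance = 1500, using the coefficients reported above; plogis() is the
# inverse logit, 1 / (1 + exp(-x)), so this comes out to roughly 0.10.
plogis(-1.154e+01 + 2.081e-05 * 40000 + 5.647e-03 * 1500)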
lm.probs = predict(default.lm, Default, type="response")
lm.pred = rep("No", length(Default$default))
lm.pred[lm.probs > 0.5] = "Yes"
mean(lm.pred!=Default$default)
## [1] 0.0263
confusionMatrix(factor(lm.pred),Default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9629 225
## Yes 38 108
##
## Accuracy : 0.9737
## 95% CI : (0.9704, 0.9767)
## No Information Rate : 0.9667
## P-Value [Acc > NIR] : 3.067e-05
##
## Kappa : 0.4396
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9961
## Specificity : 0.3243
## Pos Pred Value : 0.9772
## Neg Pred Value : 0.7397
## Prevalence : 0.9667
## Detection Rate : 0.9629
## Detection Prevalence : 0.9854
## Balanced Accuracy : 0.6602
##
## 'Positive' Class : No
##
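As a sanity check, the 2.63% error rate above is just the off-diagonal share of the confusion matrix:
# 225 defaulters predicted "No" plus 38 non-defaulters predicted "Yes", out of 10000
(225 + 38) / nrow(Default)
## [1] 0.0263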
set.seed(111)
train <- sample(nrow(Default),nrow(Default)*0.7)
train_default <- Default[train, ]
test_default <- Default[-train, ]
sample1 <- glm(default ~ income + balance , data = train_default,family = "binomial")
summary(sample1)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train_default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4768 -0.1461 -0.0572 -0.0209 3.7196
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.145e+01 5.078e-01 -22.548 < 2e-16 ***
## income 1.790e-05 5.899e-06 3.034 0.00241 **
## balance 5.675e-03 2.652e-04 21.402 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2163.7 on 6999 degrees of freedom
## Residual deviance: 1139.0 on 6997 degrees of freedom
## AIC: 1145
##
## Number of Fisher Scoring iterations: 8
lm.prob = predict(sample1,test_default, type="response")
lm.class <-ifelse(lm.prob>0.5,"Yes","No")
mean(lm.class!=test_default$default)
## [1] 0.02266667
confusionMatrix(factor(lm.class),test_default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2910 60
## Yes 8 22
##
## Accuracy : 0.9773
## 95% CI : (0.9714, 0.9824)
## No Information Rate : 0.9727
## P-Value [Acc > NIR] : 0.06217
##
## Kappa : 0.3838
##
## Mcnemar's Test P-Value : 6.224e-10
##
## Sensitivity : 0.9973
## Specificity : 0.2683
## Pos Pred Value : 0.9798
## Neg Pred Value : 0.7333
## Prevalence : 0.9727
## Detection Rate : 0.9700
## Detection Prevalence : 0.9900
## Balanced Accuracy : 0.6328
##
## 'Positive' Class : No
##
#80/20 split
set.seed(111)
train <- sample(nrow(Default),nrow(Default)*0.8)
train_default <- Default[train, ]
test_default <- Default[-train, ]
sample2 <- glm(default ~ income + balance , data = train_default,family = "binomial")
lm.prob = predict(sample2,test_default, type="response")
lm.class <-ifelse(lm.prob>0.5,"Yes","No")
mean(lm.class!=test_default$default)
## [1] 0.0235
confusionMatrix(factor(lm.class),test_default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1938 41
## Yes 6 15
##
## Accuracy : 0.9765
## 95% CI : (0.9689, 0.9827)
## No Information Rate : 0.972
## P-Value [Acc > NIR] : 0.1229
##
## Kappa : 0.3801
##
## Mcnemar's Test P-Value : 7.071e-07
##
## Sensitivity : 0.9969
## Specificity : 0.2679
## Pos Pred Value : 0.9793
## Neg Pred Value : 0.7143
## Prevalence : 0.9720
## Detection Rate : 0.9690
## Detection Prevalence : 0.9895
## Balanced Accuracy : 0.6324
##
## 'Positive' Class : No
##
#90/10 split
set.seed(111)
train <- sample(nrow(Default),nrow(Default)*0.9)
train_default <- Default[train, ]
test_default <- Default[-train, ]
sample3 <- glm(default ~ income + balance , data = train_default,family = "binomial")
lm.prob = predict(sample3,test_default, type="response")
lm.class <-ifelse(lm.prob>0.5,"Yes","No")
mean(lm.class!=test_default$default)
## [1] 0.02
confusionMatrix(factor(lm.class),test_default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 976 16
## Yes 4 4
##
## Accuracy : 0.98
## 95% CI : (0.9693, 0.9877)
## No Information Rate : 0.98
## P-Value [Acc > NIR] : 0.55910
##
## Kappa : 0.2775
##
## Mcnemar's Test P-Value : 0.01391
##
## Sensitivity : 0.9959
## Specificity : 0.2000
## Pos Pred Value : 0.9839
## Neg Pred Value : 0.5000
## Prevalence : 0.9800
## Detection Rate : 0.9760
## Detection Prevalence : 0.9920
## Balanced Accuracy : 0.5980
##
## 'Positive' Class : No
##
#85/15 split
set.seed(111)
train <- sample(nrow(Default),nrow(Default)*0.85)
train_default <- Default[train, ]
test_default <- Default[-train, ]
sample4 <- glm(default ~ income + balance , data = train_default,family = "binomial")
lm.prob = predict(sample4,test_default, type="response")
lm.class <-ifelse(lm.prob>0.5,"Yes","No")
mean(lm.class!=test_default$default)
## [1] 0.022
confusionMatrix(factor(lm.class),test_default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1455 28
## Yes 5 12
##
## Accuracy : 0.978
## 95% CI : (0.9692, 0.9848)
## No Information Rate : 0.9733
## P-Value [Acc > NIR] : 0.1479976
##
## Kappa : 0.4117
##
## Mcnemar's Test P-Value : 0.0001283
##
## Sensitivity : 0.9966
## Specificity : 0.3000
## Pos Pred Value : 0.9811
## Neg Pred Value : 0.7059
## Prevalence : 0.9733
## Detection Rate : 0.9700
## Detection Prevalence : 0.9887
## Balanced Accuracy : 0.6483
##
## 'Positive' Class : No
##
#Based on these results, the 90/10 split gives the best performance: 98% accuracy and a 2% test error rate, below the 2.63% error rate of the initial logistic regression fit evaluated on the full data set.
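The four splits above repeat the same fit/predict/score steps by hand; a small helper (a sketch only; the function name and arguments are mine, not part of the exercise) would let the split fractions be compared in a single call:
# Validation-set error rate for a given split fraction. Assumes the response is a
# binary factor with levels "No"/"Yes", as Default$default is.
validation_error <- function(formula, data, response, frac, seed = 111) {
  set.seed(seed)
  train <- sample(nrow(data), floor(nrow(data) * frac))
  fit <- glm(formula, data = data[train, ], family = "binomial")
  prob <- predict(fit, newdata = data[-train, ], type = "response")
  pred <- ifelse(prob > 0.5, "Yes", "No")
  mean(pred != data[-train, response])
}
sapply(c(0.7, 0.8, 0.85, 0.9), function(f)
  validation_error(default ~ income + balance, Default, "default", frac = f))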
#Adding a dummy variable for student (70/30 split)
set.seed(111)
train <- sample(nrow(Default),nrow(Default)*0.7)
train_default <- Default[train, ]
test_default <- Default[-train, ]
sample5 <- glm(default ~ income + balance + student , data = train_default,family = "binomial")
lm.prob = predict(sample5,test_default, type="response")
lm.class <-ifelse(lm.prob>0.5,"Yes","No")
mean(lm.class!=test_default$default)
## [1] 0.02266667
confusionMatrix(factor(lm.class),test_default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2910 60
## Yes 8 22
##
## Accuracy : 0.9773
## 95% CI : (0.9714, 0.9824)
## No Information Rate : 0.9727
## P-Value [Acc > NIR] : 0.06217
##
## Kappa : 0.3838
##
## Mcnemar's Test P-Value : 6.224e-10
##
## Sensitivity : 0.9973
## Specificity : 0.2683
## Pos Pred Value : 0.9798
## Neg Pred Value : 0.7333
## Prevalence : 0.9727
## Detection Rate : 0.9700
## Detection Prevalence : 0.9900
## Balanced Accuracy : 0.6328
##
## 'Positive' Class : No
##
#Adding the dummy variable for student does not reduce the test error (2.27%), so it is best not to include it in this case.
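Because a single validation split can be noisy, and the boot package is already loaded, a k-fold cross-validation comparison of the two models is a natural follow-up (a sketch, not part of the original exercise; the cost function counts a fitted probability on the wrong side of 0.5 as a misclassification):
# cv.glm() passes the 0/1 response and out-of-fold fitted probabilities to cost()
cost <- function(obs, prob) mean(abs(obs - prob) > 0.5)
fit_base <- glm(default ~ income + balance, data = Default, family = "binomial")
fit_student <- glm(default ~ income + balance + student, data = Default, family = "binomial")
set.seed(111)
c(base = cv.glm(Default, fit_base, cost, K = 10)$delta[1],
  student = cv.glm(Default, fit_student, cost, K = 10)$delta[1])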