library(ISLR2)
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.1.8
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(class)
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:lattice':
##
## melanoma
data("Default")
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
attach(Default)
default.lm <- glm(default ~ income + balance , data = Default,family = "binomial")
summary(default.lm)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
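The coefficients above can be read on the probability scale. As a quick illustration (the income and balance values below are made up for this write-up, not taken from the exercise):
# Implied default probability for a hypothetical borrower with income = 40000
# and balance = 1500, using the coefficients reported above; plogis() is the
# inverse logit, 1 / (1 + exp(-x)), so this comes out to roughly 0.10.
plogis(-1.154e+01 + 2.081e-05 * 40000 + 5.647e-03 * 1500)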
lm.probs = predict(default.lm, Default, type="response")
lm.pred = rep("No", length(Default$default))
lm.pred[lm.probs > 0.5] = "Yes"
mean(lm.pred!=Default$default)
## [1] 0.0263
confusionMatrix(factor(lm.pred),Default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9629 225
## Yes 38 108
##
## Accuracy : 0.9737
## 95% CI : (0.9704, 0.9767)
## No Information Rate : 0.9667
## P-Value [Acc > NIR] : 3.067e-05
##
## Kappa : 0.4396
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9961
## Specificity : 0.3243
## Pos Pred Value : 0.9772
## Neg Pred Value : 0.7397
## Prevalence : 0.9667
## Detection Rate : 0.9629
## Detection Prevalence : 0.9854
## Balanced Accuracy : 0.6602
##
## 'Positive' Class : No
##
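As a sanity check, the 2.63% error rate above is just the off-diagonal share of the confusion matrix:
# 225 defaulters predicted "No" plus 38 non-defaulters predicted "Yes", out of 10000
(225 + 38) / nrow(Default)
## [1] 0.0263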
set.seed(111)
train <- sample(nrow(Default),nrow(Default)*0.7)
train_default <- Default[train, ]
test_default <- Default[-train, ]
sample1 <- glm(default ~ income + balance , data = train_default,family = "binomial")
summary(sample1)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train_default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4768 -0.1461 -0.0572 -0.0209 3.7196
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.145e+01 5.078e-01 -22.548 < 2e-16 ***
## income 1.790e-05 5.899e-06 3.034 0.00241 **
## balance 5.675e-03 2.652e-04 21.402 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2163.7 on 6999 degrees of freedom
## Residual deviance: 1139.0 on 6997 degrees of freedom
## AIC: 1145
##
## Number of Fisher Scoring iterations: 8
lm.prob = predict(sample1,test_default, type="response")
lm.class <-ifelse(lm.prob>0.5,"Yes","No")
mean(lm.class!=test_default$default)
## [1] 0.02266667
confusionMatrix(factor(lm.class),test_default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2910 60
## Yes 8 22
##
## Accuracy : 0.9773
## 95% CI : (0.9714, 0.9824)
## No Information Rate : 0.9727
## P-Value [Acc > NIR] : 0.06217
##
## Kappa : 0.3838
##
## Mcnemar's Test P-Value : 6.224e-10
##
## Sensitivity : 0.9973
## Specificity : 0.2683
## Pos Pred Value : 0.9798
## Neg Pred Value : 0.7333
## Prevalence : 0.9727
## Detection Rate : 0.9700
## Detection Prevalence : 0.9900
## Balanced Accuracy : 0.6328
##
## 'Positive' Class : No
##
#80/20 split
set.seed(111)
train <- sample(nrow(Default),nrow(Default)*0.8)
train_default <- Default[train, ]
test_default <- Default[-train, ]
sample2 <- glm(default ~ income + balance , data = train_default,family = "binomial")
lm.prob = predict(sample2,test_default, type="response")
lm.class <-ifelse(lm.prob>0.5,"Yes","No")
mean(lm.class!=test_default$default)
## [1] 0.0235
confusionMatrix(factor(lm.class),test_default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1938 41
## Yes 6 15
##
## Accuracy : 0.9765
## 95% CI : (0.9689, 0.9827)
## No Information Rate : 0.972
## P-Value [Acc > NIR] : 0.1229
##
## Kappa : 0.3801
##
## Mcnemar's Test P-Value : 7.071e-07
##
## Sensitivity : 0.9969
## Specificity : 0.2679
## Pos Pred Value : 0.9793
## Neg Pred Value : 0.7143
## Prevalence : 0.9720
## Detection Rate : 0.9690
## Detection Prevalence : 0.9895
## Balanced Accuracy : 0.6324
##
## 'Positive' Class : No
##
#90/10 split
set.seed(111)
train <- sample(nrow(Default),nrow(Default)*0.9)
train_default <- Default[train, ]
test_default <- Default[-train, ]
sample3 <- glm(default ~ income + balance , data = train_default,family = "binomial")
lm.prob = predict(sample3,test_default, type="response")
lm.class <-ifelse(lm.prob>0.5,"Yes","No")
mean(lm.class!=test_default$default)
## [1] 0.02
confusionMatrix(factor(lm.class),test_default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 976 16
## Yes 4 4
##
## Accuracy : 0.98
## 95% CI : (0.9693, 0.9877)
## No Information Rate : 0.98
## P-Value [Acc > NIR] : 0.55910
##
## Kappa : 0.2775
##
## Mcnemar's Test P-Value : 0.01391
##
## Sensitivity : 0.9959
## Specificity : 0.2000
## Pos Pred Value : 0.9839
## Neg Pred Value : 0.5000
## Prevalence : 0.9800
## Detection Rate : 0.9760
## Detection Prevalence : 0.9920
## Balanced Accuracy : 0.5980
##
## 'Positive' Class : No
##
#85/15 split
set.seed(111)
train <- sample(nrow(Default),nrow(Default)*0.85)
train_default <- Default[train, ]
test_default <- Default[-train, ]
sample4 <- glm(default ~ income + balance , data = train_default,family = "binomial")
lm.prob = predict(sample4,test_default, type="response")
lm.class <-ifelse(lm.prob>0.5,"Yes","No")
mean(lm.class!=test_default$default)
## [1] 0.022
confusionMatrix(factor(lm.class),test_default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1455 28
## Yes 5 12
##
## Accuracy : 0.978
## 95% CI : (0.9692, 0.9848)
## No Information Rate : 0.9733
## P-Value [Acc > NIR] : 0.1479976
##
## Kappa : 0.4117
##
## Mcnemar's Test P-Value : 0.0001283
##
## Sensitivity : 0.9966
## Specificity : 0.3000
## Pos Pred Value : 0.9811
## Neg Pred Value : 0.7059
## Prevalence : 0.9733
## Detection Rate : 0.9700
## Detection Prevalence : 0.9887
## Balanced Accuracy : 0.6483
##
## 'Positive' Class : No
##
#Based on these results, the 90/10 split gives the best performance: 98% accuracy and a 2% test error rate, below the 2.63% error rate of the initial logistic regression fit evaluated on the full data set.
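The four splits above repeat the same fit/predict/score steps by hand; a small helper (a sketch only; the function name and arguments are mine, not part of the exercise) would let the split fractions be compared in a single call:
# Validation-set error rate for a given split fraction. Assumes the response is a
# binary factor with levels "No"/"Yes", as Default$default is.
validation_error <- function(formula, data, response, frac, seed = 111) {
  set.seed(seed)
  train <- sample(nrow(data), floor(nrow(data) * frac))
  fit <- glm(formula, data = data[train, ], family = "binomial")
  prob <- predict(fit, newdata = data[-train, ], type = "response")
  pred <- ifelse(prob > 0.5, "Yes", "No")
  mean(pred != data[-train, response])
}
sapply(c(0.7, 0.8, 0.85, 0.9), function(f)
  validation_error(default ~ income + balance, Default, "default", frac = f))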
#Adding a dummy variable for student (70/30 split)
set.seed(111)
train <- sample(nrow(Default),nrow(Default)*0.7)
train_default <- Default[train, ]
test_default <- Default[-train, ]
sample5 <- glm(default ~ income + balance + student , data = train_default,family = "binomial")
lm.prob = predict(sample5,test_default, type="response")
lm.class <-ifelse(lm.prob>0.5,"Yes","No")
mean(lm.class!=test_default$default)
## [1] 0.02266667
confusionMatrix(factor(lm.class),test_default$default)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 2910 60
## Yes 8 22
##
## Accuracy : 0.9773
## 95% CI : (0.9714, 0.9824)
## No Information Rate : 0.9727
## P-Value [Acc > NIR] : 0.06217
##
## Kappa : 0.3838
##
## Mcnemar's Test P-Value : 6.224e-10
##
## Sensitivity : 0.9973
## Specificity : 0.2683
## Pos Pred Value : 0.9798
## Neg Pred Value : 0.7333
## Prevalence : 0.9727
## Detection Rate : 0.9700
## Detection Prevalence : 0.9900
## Balanced Accuracy : 0.6328
##
## 'Positive' Class : No
##
#Adding the dummy variable for student does not reduce the test error (2.27%), so it is best not to include it in this case.
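Because a single validation split can be noisy, and the boot package is already loaded, a k-fold cross-validation comparison of the two models is a natural follow-up (a sketch, not part of the original exercise; the cost function counts a fitted probability on the wrong side of 0.5 as a misclassification):
# cv.glm() passes the 0/1 response and out-of-fold fitted probabilities to cost()
cost <- function(obs, prob) mean(abs(obs - prob) > 0.5)
fit_base <- glm(default ~ income + balance, data = Default, family = "binomial")
fit_student <- glm(default ~ income + balance + student, data = Default, family = "binomial")
set.seed(111)
c(base = cv.glm(Default, fit_base, cost, K = 10)$delta[1],
  student = cv.glm(Default, fit_student, cost, K = 10)$delta[1])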