Load data

# Work on a copy of the Default data set
# (presumably ISLR::Default, attached in a setup chunk not shown here -- confirm).
data <- Default
# Preview the first five rows.
head(data, n = 5)
##   default student   balance   income
## 1      No      No  729.5265 44361.63
## 2      No     Yes  817.1804 12106.13
## 3      No      No 1073.5492 31767.14
## 4      No      No  529.2506 35704.49
## 5      No      No  785.6559 38463.50

A: Fit the model

Based on the results, both income and balance are statistically significant predictors of default, and both have positive coefficients. This means that, holding the other predictor fixed, a higher income (or a higher balance) is associated with a higher probability of default.

# Logistic regression of default on income and balance, fitted on every row.
# The seed is kept so the RNG state after this chunk is unchanged.
set.seed(123)
model_a <- glm(
  default ~ income + balance,
  data = data,
  family = "binomial"
)
# Coefficient table, deviances and AIC of the fitted model.
summary(model_a)
## 
## Call:
## glm(formula = default ~ income + balance, family = "binomial", 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4725  -0.1444  -0.0574  -0.0211   3.7245  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.154e+01  4.348e-01 -26.545  < 2e-16 ***
## income       2.081e-05  4.985e-06   4.174 2.99e-05 ***
## balance      5.647e-03  2.274e-04  24.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2920.6  on 9999  degrees of freedom
## Residual deviance: 1579.0  on 9997  degrees of freedom
## AIC: 1585
## 
## Number of Fisher Scoring iterations: 8

B: Estimate the error of the model

# Validation-set estimate of the test error for default ~ income + balance.

# i. Split into 80% training / 20% testing rows, stratified on `default`.
set.seed(123)
data_split <- initial_split(data, prop = 0.8, strata = default)
data_training <- training(data_split)
data_testing <- testing(data_split)

# ii. Fit on the training rows only, then predict P(default) for the held-out
# rows. predict.glm() takes `newdata`; a misspelled `new_data` is silently
# absorbed by `...` and the fitted values for the training rows are returned.
model_b2 <- glm(
  default ~ income + balance,
  data = data_training,
  family = "binomial"
)
predict_b2 <- predict(model_b2, newdata = data_testing, type = "response")

# iii. Classify as a default when the predicted probability exceeds 0.5.
predict_binary_b2 <- ifelse(predict_b2 > 0.5, "Yes", "No")

# iv. Validation-set error: share of held-out rows that are misclassified.
error_b2 <- mean(predict_binary_b2 != as.character(data_testing$default))
error_b2

C: Repeat B using 3 different splits

# Repeat the validation-set procedure on three fresh random splits and collect
# the misclassification rate of each one.
# The RNG is deliberately not re-seeded, so every iteration draws a new split
# from the stream started by the last set.seed() call. The split and model
# objects stay in the global environment, so after the loop they hold the last
# split, just as they did when the steps were written out three times.
n_splits <- 3L
error_c <- numeric(n_splits)

for (i in seq_len(n_splits)) {
  # i. New 80/20 split, stratified on `default`.
  data_split <- initial_split(data, prop = 0.8, strata = default)
  data_training <- training(data_split)
  data_testing <- testing(data_split)

  # ii. Fit on the training rows and predict P(default) for the held-out rows.
  # `newdata` is the argument predict.glm() recognises; `new_data` is ignored
  # and would return predictions for the training rows instead.
  model_b2 <- glm(
    default ~ income + balance,
    data = data_training,
    family = "binomial"
  )
  predict_b2 <- predict(model_b2, newdata = data_testing, type = "response")

  # iii. Classify as a default when the predicted probability exceeds 0.5.
  predict_binary_b2 <- ifelse(predict_b2 > 0.5, "Yes", "No")

  # iv. Misclassification rate on the held-out rows of this split.
  error_c[i] <- mean(predict_binary_b2 != as.character(data_testing$default))
}

# One validation-set error per split.
error_c

D: Including dummy variable

# Logistic regression with the `student` dummy added, fitted on the current
# training rows and evaluated on the current held-out rows.
model_d <- glm(
  default ~ income + balance + student,
  data = data_training,
  family = "binomial"
)
# `newdata` (not `new_data`) is what predict.glm() uses to score new rows;
# otherwise the predictions are for the training rows.
predict_d <- predict(model_d, newdata = data_testing, type = "response")
# Classify as a default when the predicted probability exceeds 0.5.
predict_binary_d <- ifelse(predict_d > 0.5, "Yes", "No")
# Validation-set error of the model that includes `student`.
error_d <- mean(predict_binary_d != as.character(data_testing$default))
error_d