## Homework 6: Intro to Generalized Linear Models
# Split the Default data 80/20 into training and test sets, then fit a
# logistic regression of default on balance, income and student status.
# NOTE(review): `Default` and `createDataPartition()` are not defined in this
# script -- presumably they come from the ISLR/ISLR2 and caret packages;
# confirm both are attached (e.g. via library()) before this point.
set.seed(123)  # fix the RNG seed so the random partition is reproducible
# partition on the original outcome column, before any recoding
train_index <- createDataPartition(Default$default, p = 0.8, list = FALSE)
train_data <- Default[train_index, ]
test_data <- Default[-train_index, ]
# define the outcome variable: default ("Yes" -> 1, anything else -> 0);
# the same coding is applied to both splits so the held-out rows stay
# comparable with the training rows
train_data$default <- ifelse(train_data$default == "Yes", 1, 0)
test_data$default <- ifelse(test_data$default == "Yes", 1, 0)
# fit logistic regression model
logit_model <- glm(default ~ balance + income + student, data = train_data, family = binomial)
summary(logit_model)
##
## Call:
## glm(formula = default ~ balance + income + student, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.129e+01 5.666e-01 -19.927 <2e-16 ***
## balance 5.788e-03 2.619e-04 22.105 <2e-16 ***
## income 1.214e-05 9.274e-06 1.309 0.1906
## studentYes -5.467e-01 2.679e-01 -2.041 0.0413 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2340.6 on 8000 degrees of freedom
## Residual deviance: 1248.1 on 7997 degrees of freedom
## AIC: 1256.1
##
## Number of Fisher Scoring iterations: 8
# remove income and refit model: income was the only predictor in the full
# fit without a significant coefficient (p = 0.1906 in the summary above)
logit_model_improved <- glm(default ~ balance + student, data = train_data, family = binomial)
summary(logit_model_improved)
##
## Call:
## glm(formula = default ~ balance + student, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.080e+01 4.154e-01 -26.003 < 2e-16 ***
## balance 5.788e-03 2.615e-04 22.129 < 2e-16 ***
## studentYes -8.183e-01 1.674e-01 -4.889 1.01e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2340.6 on 8000 degrees of freedom
## Residual deviance: 1249.8 on 7998 degrees of freedom
## AIC: 1255.8
##
## Number of Fisher Scoring iterations: 8
# predict probabilities of belonging to class 1 (default); with no `newdata`
# these are the fitted probabilities for the training rows, so every metric
# derived from them is an in-sample figure
predicted_probabilities <- predict(logit_model_improved, type = "response")
# predicted labels based on a threshold of 0.5
predicted_labels <- ifelse(predicted_probabilities > 0.5, 1, 0)
# create the confusion matrix (rows = actual class, columns = predicted class);
# pinning both factors to the levels 0 and 1 keeps the table 2 x 2 even when a
# class never occurs, so positional lookups such as [2, 2] stay valid
conf_matrix <- table(
  Actual = factor(train_data$default, levels = c(0, 1)),
  Predicted = factor(predicted_labels, levels = c(0, 1))
)
conf_matrix
## Predicted
## Actual 0 1
## 0 7707 27
## 1 179 88
# accuracy: share of all observations that fall on the diagonal of the
# confusion matrix (actual class == predicted class)
total_predictions <- sum(conf_matrix)
correct_predictions <- sum(diag(conf_matrix))
accuracy <- correct_predictions / total_predictions
accuracy
## [1] 0.9742532
# precision: true positives divided by everything predicted positive
# (false positives [1, 2] plus true positives [2, 2])
precision <- conf_matrix[2, 2] / (conf_matrix[1, 2] + conf_matrix[2, 2])
precision
## [1] 0.7652174
# recall: true positives divided by everything actually positive
# (false negatives [2, 1] plus true positives [2, 2])
recall <- conf_matrix[2, 2] / (conf_matrix[2, 1] + conf_matrix[2, 2])
recall
## [1] 0.329588
# F1 score: harmonic mean of precision and recall
f1_score <- (2 * precision * recall) / (recall + precision)
f1_score
## [1] 0.460733
# calculate odds ratios: exponentiating the fitted coefficients (which are on
# the log-odds scale) gives the multiplicative change in the odds of default
# per one-unit increase in each predictor; the intercept entry is the
# baseline odds rather than a ratio
odds_ratios <- exp(coef(logit_model_improved))
odds_ratios
## (Intercept) balance studentYes
## 2.038786e-05 1.005804e+00 4.411904e-01
# calculate the deviance: extract the residual deviance of the reduced model
# (the "Residual deviance" figure reported by its summary)
dev <- deviance(logit_model_improved)
dev
## [1] 1249.837