## Homework 6: Intro to Generalized Linear Models

a. Using the train_data data set, fit the model (2 points)

# Partition Default into a training set (p = 0.8) and a test set holding the
# remaining rows; the seed makes the random partition reproducible.
set.seed(123)
train_index <- createDataPartition(y = Default$default, p = 0.8, list = FALSE)
train_data <- Default[train_index, ]
test_data <- Default[-train_index, ]

# Recode the training outcome as numeric: 1 for "Yes", 0 otherwise.
train_data[["default"]] <- as.numeric(train_data[["default"]] == "Yes")

# Fit a logistic regression of default on balance, income and student.
logit_model <- glm(
  default ~ balance + income + student,
  family = binomial,
  data = train_data
)
summary(logit_model)
## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.129e+01  5.666e-01 -19.927   <2e-16 ***
## balance      5.788e-03  2.619e-04  22.105   <2e-16 ***
## income       1.214e-05  9.274e-06   1.309   0.1906    
## studentYes  -5.467e-01  2.679e-01  -2.041   0.0413 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2340.6  on 8000  degrees of freedom
## Residual deviance: 1248.1  on 7997  degrees of freedom
## AIC: 1256.1
## 
## Number of Fisher Scoring iterations: 8
# Refit the first model with income dropped from the formula; family and
# data are carried over from logit_model's original call.
logit_model_improved <- update(logit_model, . ~ . - income)
summary(logit_model_improved)
## 
## Call:
## glm(formula = default ~ balance + student, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.080e+01  4.154e-01 -26.003  < 2e-16 ***
## balance      5.788e-03  2.615e-04  22.129  < 2e-16 ***
## studentYes  -8.183e-01  1.674e-01  -4.889 1.01e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2340.6  on 8000  degrees of freedom
## Residual deviance: 1249.8  on 7998  degrees of freedom
## AIC: 1255.8
## 
## Number of Fisher Scoring iterations: 8

b. Calculate the confusion matrix for the test data set (2 points)

# Predict the probability of default (class 1) for the held-out test set.
# Without `newdata`, predict() returns fitted values for the training rows,
# which would evaluate the model on the data it was fitted to.
predicted_probabilities <- predict(
  logit_model_improved,
  newdata = test_data,
  type = "response"
)

# predicted labels based on a threshold of 0.5
predicted_labels <- ifelse(predicted_probabilities > 0.5, 1, 0)

# Only train_data$default was recoded to 1/0, so recode the test outcome the
# same way before comparing it with the predicted labels.
actual_labels <- ifelse(test_data$default == "Yes", 1, 0)

# Create the confusion matrix. Fixing the factor levels keeps the table
# 2 x 2 even if one class is never predicted.
# NOTE: any printed matrix and metrics generated before this change describe
# the training data; re-run the document to refresh them.
conf_matrix <- table(
  Actual = factor(actual_labels, levels = c(0, 1)),
  Predicted = factor(predicted_labels, levels = c(0, 1))
)
conf_matrix
##       Predicted
## Actual    0    1
##      0 7707   27
##      1  179   88

c. Calculate accuracy, precision, and recall (2 points)

# Pull the four cells out of the confusion matrix by label rather than by
# position (rows = Actual, columns = Predicted, "1" = default).
true_pos <- conf_matrix["1", "1"]
true_neg <- conf_matrix["0", "0"]
false_pos <- conf_matrix["0", "1"]
false_neg <- conf_matrix["1", "0"]

# accuracy: share of all predictions that match the actual class
correct_predictions <- true_pos + true_neg
total_predictions <- sum(conf_matrix)
accuracy <- correct_predictions / total_predictions
accuracy
## [1] 0.9742532
# precision: share of predicted defaults that are actual defaults
precision <- true_pos / (true_pos + false_pos)
precision
## [1] 0.7652174
# recall: share of actual defaults that were predicted as defaults
recall <- true_pos / (true_pos + false_neg)
recall
## [1] 0.329588

d. Calculate the F1 score (2 points)

# F1 score: harmonic mean of precision and recall
f1_score <- 2 * precision * recall / (precision + recall)
f1_score
## [1] 0.460733

e. Extract the coefficients and their log odds (1 point)

# The fitted coefficients are on the log-odds scale; exponentiating them
# gives odds ratios.
model_coefficients <- coef(logit_model_improved)
odds_ratios <- exp(model_coefficients)
odds_ratios
##  (Intercept)      balance   studentYes 
## 2.038786e-05 1.005804e+00 4.411904e-01

f. Calculate the deviance (1 point)

# Read the deviance stored on the fitted glm object (the same value that
# deviance() reports for a glm).
dev <- logit_model_improved[["deviance"]]
dev
## [1] 1249.837