Jackson_Garrett

## Homework 6: Intro to Generalized Linear Models

a. Using the train_data data set, fit the model (2 points)

# split Default into training and test rows; the seed is fixed so the same
# rows are drawn on every run
set.seed(123)
train_index <- createDataPartition(
  y = Default[["default"]],
  p = 0.8,
  list = FALSE
)

# rows selected by the partition form the training set, the rest the test set
train_data <- Default[train_index, ]
test_data <- Default[-train_index, ]

# recode the outcome in the training set as numeric 0/1 (1 = "Yes")
train_data$default <- as.numeric(train_data$default == "Yes")

# fit the full logistic regression model on all three predictors
logit_model <- glm(
  formula = default ~ balance + income + student,
  family = binomial,
  data = train_data
)
summary(logit_model)

## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.129e+01  5.666e-01 -19.927   <2e-16 ***
## balance      5.788e-03  2.619e-04  22.105   <2e-16 ***
## income       1.214e-05  9.274e-06   1.309   0.1906    
## studentYes  -5.467e-01  2.679e-01  -2.041   0.0413 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2340.6  on 8000  degrees of freedom
## Residual deviance: 1248.1  on 7997  degrees of freedom
## AIC: 1256.1
## 
## Number of Fisher Scoring iterations: 8

# income was not significant in the full model (p = 0.19), so refit the
# logistic regression with balance and student only
logit_model_improved <- glm(
  formula = default ~ balance + student,
  family = binomial,
  data = train_data
)
summary(logit_model_improved)

## 
## Call:
## glm(formula = default ~ balance + student, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.080e+01  4.154e-01 -26.003  < 2e-16 ***
## balance      5.788e-03  2.615e-04  22.129  < 2e-16 ***
## studentYes  -8.183e-01  1.674e-01  -4.889 1.01e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2340.6  on 8000  degrees of freedom
## Residual deviance: 1249.8  on 7998  degrees of freedom
## AIC: 1255.8
## 
## Number of Fisher Scoring iterations: 8

b. Calculate the confusion matrix for the test data set (2 points)

# predict probabilities of belonging to class 1 (default) for the held-out
# test set; without `newdata`, predict() returns the fitted values for the
# training rows instead
predicted_probabilities <- predict(
  logit_model_improved,
  newdata = test_data,
  type = "response"
)

# predicted labels based on a threshold of 0.5
predicted_labels <- as.integer(predicted_probabilities > 0.5)

# only train_data$default was recoded to 0/1 above, so put the test outcome
# on the same 0/1 scale before comparing it with the predictions
actual_labels <- as.integer(test_data$default == "Yes")

# create the confusion matrix; fixing the levels keeps the table 2 x 2 even
# if one class is never predicted, so the [2, 2] lookups used for precision
# and recall stay valid
# NOTE: re-knit after this change -- the printed results in this document
# were generated from the training data
class_levels <- c(0L, 1L)
conf_matrix <- table(
  Actual = factor(actual_labels, levels = class_levels),
  Predicted = factor(predicted_labels, levels = class_levels)
)
conf_matrix

##       Predicted
## Actual    0    1
##      0 7707   27
##      1  179   88

c. Calculate accuracy, precision, and recall (2 points)

# accuracy: share of all observations that sit on the diagonal of the
# confusion matrix (predicted class equals actual class)
total_predictions <- sum(conf_matrix)
correct_predictions <- sum(diag(conf_matrix))
accuracy <- correct_predictions / total_predictions
accuracy

## [1] 0.9742532

# precision: true positives divided by everything predicted as class 1
# (second column of the confusion matrix)
precision <- conf_matrix[2, 2] / colSums(conf_matrix)[[2]]
precision

## [1] 0.7652174

# recall: true positives divided by everything that is actually class 1
# (second row of the confusion matrix)
recall <- conf_matrix[2, 2] / rowSums(conf_matrix)[[2]]
recall

## [1] 0.329588

d. Calculate the F1 score (2 points)

# F1 score: harmonic mean of precision and recall
f1_score <- (2 * precision * recall) / (precision + recall)
f1_score

## [1] 0.460733

e. Extract the coefficients and their log odds (1 point)

# extract the fitted coefficients; in a logistic regression these are already
# on the log-odds scale
log_odds <- coef(logit_model_improved)
log_odds

# exponentiate the log-odds to express each coefficient as an odds ratio
odds_ratios <- exp(log_odds)
odds_ratios

##  (Intercept)      balance   studentYes 
## 2.038786e-05 1.005804e+00 4.411904e-01

f. Calculate the deviance (1 point)

# calculate the deviance
dev <- deviance(logit_model_improved)
dev

## [1] 1249.837

Jackson_Garrett_Module6HW

Garrett Jackson

2025-02-24

a. Using the train_data data set, fit the model (2 points)

b. Calculate the confusion matrix for the test data set (2 points)

c. Calculate accuracy, precision, and recall (2 points)

d. Calculate the F1 score (2 points)

e. Extract the coefficients and their log odds (1 point)

f. Calculate the deviance (1 point)