Simple logistic regression applied to the Default data set from the ISLR package in R

This data set contains the following information about 10,000 individuals:

default: Indicates whether or not an individual defaulted.

student: Indicates whether or not an individual is a student.

balance: Average balance carried by an individual.

income: Income of the individual.

# Install ISLR only when it is not already available, so that re-running the
# script does not trigger a reinstall on every execution.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
## The following package(s) will be installed:
## - ISLR [1.4]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
## 
## # Installing packages --------------------------------------------------------
## - Installing ISLR ...                           OK [linked from cache]
## Successfully installed 1 package in 2.6 milliseconds.
library(ISLR)
# Load the Default data set from the ISLR package into `data`.
# NOTE(review): `data` shares its name with utils::data(); a more specific
# name would be clearer, but later statements read this variable.
data <- ISLR::Default

# Print a per-column summary: level counts for the two factor columns and
# min / quartiles / mean / max for the two numeric columns.
summary(data)
##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554
# Print the number of rows (observations) in the data set.
nrow(data)
## [1] 10000
# Make the random train/test split reproducible.
set.seed(10)

# Label every row TRUE (training) with probability 0.7 or FALSE (test) with
# probability 0.3. `replace = TRUE` is required only because nrow(data)
# labels are drawn from the two-element vector c(TRUE, FALSE); it does NOT
# cause any row to be reused. Each row receives exactly one label, so it ends
# up in exactly one of the two sets, and the 70/30 proportions hold on
# average rather than exactly.
# The index is named `is_train` rather than `sample` so that it does not
# shadow base::sample().
is_train <- sample(
  c(TRUE, FALSE),
  nrow(data),
  replace = TRUE,
  prob = c(0.7, 0.3)
)

# Save the TRUE/FALSE assignment so the split can be inspected later.
write.csv(is_train, "sample.csv")

# Rows labelled TRUE form the training set; the remaining rows form the test set.
train <- data[is_train, ]
test <- data[!is_train, ]

# Fit a logistic regression (binomial-family GLM) of `default` on `student`,
# `balance` and `income`, using only the rows in `train`.
model <- glm(default~student+balance+income, family="binomial", data=train)

# Set a large `scipen` penalty so printed numbers use fixed rather than
# scientific notation. This changes a session-wide option and is not
# restored afterwards.
options(scipen=999)

# Print the coefficient table, deviances and AIC of the fitted model.
summary(model)
## 
## Call:
## glm(formula = default ~ student + balance + income, family = "binomial", 
##     data = train)
## 
## Coefficients:
##                  Estimate    Std. Error z value            Pr(>|z|)    
## (Intercept) -11.137742154   0.589351919 -18.898 <0.0000000000000002 ***
## studentYes   -0.528687513   0.276309404  -1.913              0.0557 .  
## balance       0.005850292   0.000276911  21.127 <0.0000000000000002 ***
## income        0.000007058   0.000009516   0.742              0.4583    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2161.7  on 6972  degrees of freedom
## Residual deviance: 1142.4  on 6969  degrees of freedom
## AIC: 1150.4
## 
## Number of Fisher Scoring iterations: 8
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Variable-importance scores from caret for the fitted model. The values
# printed below match the absolute z values in the summary(model) table.
caret::varImp(model)
##              Overall
## studentYes  1.913390
## balance    21.126944
## income      0.741656
# Two hypothetical individuals with the same balance and income who differ
# only in student status ("Yes" versus "No").
new <- data.frame(
  balance = rep(1000, 2),
  income = rep(3000, 2),
  student = c("Yes", "No")
)

# Predicted probability of default for each individual (response scale).
predict(model, newdata = new, type = "response")
##           1           2 
## 0.003033610 0.005136283
# Predicted probability of default (response scale) for every row of `test`.
predicted <- predict(model, newdata = test, type = "response")

# Save the predicted probabilities to disk.
write.csv(predicted, file = "predicted.csv")