This data set contains the following information about 10,000 individuals:
default: Indicates whether or not an individual defaulted.
student: Indicates whether or not an individual is a student.
balance: Average balance carried by an individual.
income: Income of the individual.
# Install ISLR only when it is not already available, so that re-running the
# script does not reinstall the package (and need network access) every time.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
## The following package(s) will be installed:
## - ISLR [1.4]
## These packages will be installed into "~/Documents/MA Economics/MA Economics/renv/library/R-4.3/aarch64-apple-darwin20".
##
## # Installing packages --------------------------------------------------------
## - Installing ISLR ... OK [linked from cache]
## Successfully installed 1 package in 2.6 milliseconds.
library(ISLR)
# Load the Default data set that ships with ISLR and print a per-column summary
data <- ISLR::Default
summary(object = data)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
# Number of observations (rows) in the data set
dim(data)[1L]
## [1] 10000
# Fix the RNG seed so the random split below is reproducible
set.seed(seed = 10)
# Draw one TRUE/FALSE label per row of data: TRUE (training set) with
# probability 0.7, FALSE (testing set) with probability 0.3
sample <- sample(
  x = c(TRUE, FALSE),
  size = nrow(data),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
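#The sample() call above creates a logical index with one entry per observation. Each row is independently labelled TRUE with
#probability 0.7 and FALSE with probability 0.3, so roughly (not exactly) 70% of the rows are TRUE. The rows labelled
#TRUE become part of the training dataset and the rows labelled FALSE become part of the test dataset.
#replace = TRUE is needed because nrow(data) labels are drawn from only two values (TRUE and FALSE). It does not mean that
#rows of the data are reused: every row ends up in exactly one of the two datasets.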
# Save the TRUE/FALSE split labels so the partition can be inspected later
write.csv(x = sample, file = "sample.csv")
# Rows labelled TRUE form the training set; the remaining rows form the test set
train <- data[sample, , drop = FALSE]
test <- data[!sample, , drop = FALSE]
# Fit a logistic regression of default on student status, balance and income,
# using the training rows only
model <- glm(
  formula = default ~ student + balance + income,
  family = "binomial",
  data = train
)
# Print numbers in fixed rather than scientific notation from here on
options(scipen = 999)
# Show coefficient estimates, standard errors and deviance statistics
summary(object = model)
##
## Call:
## glm(formula = default ~ student + balance + income, family = "binomial",
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -11.137742154 0.589351919 -18.898 <0.0000000000000002 ***
## studentYes -0.528687513 0.276309404 -1.913 0.0557 .
## balance 0.005850292 0.000276911 21.127 <0.0000000000000002 ***
## income 0.000007058 0.000009516 0.742 0.4583
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2161.7 on 6972 degrees of freedom
## Residual deviance: 1142.4 on 6969 degrees of freedom
## AIC: 1150.4
##
## Number of Fisher Scoring iterations: 8
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Variable importance for each predictor of the fitted model; in the output
# below the scores match the absolute z values from the model summary
caret::varImp(object = model)
## Overall
## studentYes 1.913390
## balance 21.126944
## income 0.741656
# Two hypothetical individuals with the same balance and income: the first is
# a student, the second is not
new <- data.frame(
  balance = 1000,
  income = 3000,
  student = c("Yes", "No")
)
# Predicted probability of default for each of the two individuals
predict(model, newdata = new, type = "response")
## 1 2
## 0.003033610 0.005136283
# Predicted probability of default for every row of the test dataset,
# written to predicted.csv
predicted <- predict(model, newdata = test, type = "response")
write.csv(x = predicted, file = "predicted.csv")