# Load the ISLR2 package
library(ISLR2)
# Install (if needed) and load the tidyverse for data manipulation and visualization
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the `Default` data set into the workspace
# (presumably shipped with ISLR2, which is attached above -- confirm).
data("Default")
# Optional interactive preview of the first rows; left disabled because
# View() opens a viewer window.
#Default%>%head()%>%View()
# EDA to understand patterns of default ----
# Share of observations in each `default` category, as a percentage of all rows
percentage_data <- Default %>%
  group_by(default) %>%
  summarise(percentage = n() / nrow(Default) * 100)

# Bar chart with one bar per `default` category; each bar is labelled with its
# percentage rounded to a whole number, and the y-axis is formatted as percent.
percentage_data %>%
  ggplot(aes(x = default, y = percentage)) +
  geom_col(
    aes(fill = default),
    position = position_dodge(width = 0.9),
    alpha = 0.7,
    show.legend = FALSE
  ) +
  geom_text(
    aes(label = paste0(round(percentage), "%")),
    position = position_dodge(width = 0.9),
    vjust = -0.5
  ) +
  # scale = 1 because `percentage` is already on a 0-100 scale
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(title = "Percentage of Defaults") +
  theme_minimal()

# Cross-tabulate student status against default status.
# For every (student, default) cell, keep the row count `n` and that cell's
# percentage of all observations, then spread the student levels into columns
# so the result has one row per `default` level.
cross_tab <- Default %>%
  count(student, default) %>%
  mutate(percent = n / sum(n) * 100) %>%
  pivot_wider(
    names_from = student,
    values_from = c(n, percent),
    names_prefix = "student_"
  )
cross_tab
## # A tibble: 2 × 5
## default n_student_No n_student_Yes percent_student_No percent_student_Yes
## <fct> <int> <int> <dbl> <dbl>
## 1 No 6850 2817 68.5 28.2
## 2 Yes 206 127 2.06 1.27
#Fitting the glm model
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Fix the RNG seed so the random train/test split below -- and therefore the
# fitted coefficients and predictions -- are reproducible from run to run.
set.seed(123)

# Hold out 20% of the rows for testing; createDataPartition() draws the
# training rows (p = 0.8) based on the outcome `default`, and list = FALSE
# returns the row indices as a matrix rather than a list.
index <- createDataPartition(y = Default$default, p = 0.8, list = FALSE)
train_data_def <- Default[index, ]
test_data_def <- Default[-index, ]

# Logistic regression of `default` on all other columns of the training data
model <- glm(
  default ~ .,
  data = train_data_def,
  family = binomial(link = "logit")
)

# Predict on the test set. type = "response" yields fitted probabilities, not
# class labels, so despite its name this column holds probabilities.
test_data_def$predicted_default_status <- predict(
  model,
  newdata = test_data_def,
  type = "response"
)

summary(model)
##
## Call:
## glm(formula = default ~ ., family = binomial(link = logit), data = train_data_def)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.099e+01 5.600e-01 -19.632 <2e-16 ***
## studentYes -5.844e-01 2.684e-01 -2.177 0.0295 *
## balance 5.789e-03 2.617e-04 22.119 <2e-16 ***
## income 3.629e-06 9.375e-06 0.387 0.6987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2340.6 on 8000 degrees of freedom
## Residual deviance: 1244.3 on 7997 degrees of freedom
## AIC: 1252.3
##
## Number of Fisher Scoring iterations: 8