# Load the ISLR2 package (source of the `Default` data set used below)
library(ISLR2)
#installing and loading library tidyverse for computation and visualization
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the `Default` data set into the workspace. data() looks the name up in
# the currently attached packages (ISLR2 is attached at the top of the file).
data("Default")
# Optional interactive peek at the first rows; left disabled because View()
# opens a viewer window and is only useful in an interactive session.
#Default%>%head()%>%View()

# Exploratory data analysis (EDA) to understand patterns of default

# Share of observations in each `default` class, in percent of all rows.
# count() tallies rows per class; transmute() keeps only the class label and
# the derived percentage column.
percentage_data <- Default %>%
  as_tibble() %>%
  count(default) %>%
  transmute(default, percentage = n / nrow(Default) * 100)

# Bar chart of the share of each `default` class, y-axis in percent.
# There is exactly one bar per x value and no faceting, so no dodge position
# adjustment is needed; x/y are declared once and inherited by both layers.
ggplot(percentage_data, aes(x = default, y = percentage)) +
  geom_col(aes(fill = default), show.legend = FALSE, alpha = 0.7) +
  # Label sits just above each bar, rounded to a whole percent
  geom_text(aes(label = paste0(round(percentage), "%")), vjust = -0.5) +
  # `percentage` is already on a 0-100 scale, hence scale = 1
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(title = "Percentage of Defaults") +
  theme_minimal()

# Cross-tabulate student status against default status.
# `percent` is each cell's share of ALL observations (the four cells sum to
# 100), not the default rate within each student group. The table is then
# spread so that each student level gets its own count and percent column.
cross_tab <- count(Default, student, default) %>%
  mutate(percent = n / sum(n) * 100) %>%
  pivot_wider(
    names_from = student,
    values_from = c(n, percent),
    names_prefix = "student_"
  )

cross_tab
## # A tibble: 2 × 5
##   default n_student_No n_student_Yes percent_student_No percent_student_Yes
##   <fct>          <int>         <int>              <dbl>               <dbl>
## 1 No              6850          2817              68.5                28.2 
## 2 Yes              206           127               2.06                1.27
#Fitting the glm model
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Train/test split (80% / 20%) on the outcome `default`.
# createDataPartition() samples at random, so the RNG seed is fixed first;
# without it every run produces a different split and different estimates.
# NOTE(review): the seed value is arbitrary. Any console output recorded in
# this file from an earlier, unseeded run will not match these numbers.
set.seed(123)
index <- createDataPartition(y = Default$default, p = 0.8, list = FALSE)
train_data_def <- Default[index, ]
test_data_def <- Default[-index, ]

# Logistic regression of `default` on all other columns of the training set
model <- glm(
  default ~ .,
  data = train_data_def,
  family = binomial(link = logit)
)

# Predicted probabilities (type = "response") for the held-out rows. Despite
# the column name these are probabilities, not class labels.
test_data_def$predicted_default_status <- predict(
  model,
  newdata = test_data_def,
  type = "response"
)

summary(model)
## 
## Call:
## glm(formula = default ~ ., family = binomial(link = logit), data = train_data_def)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.099e+01  5.600e-01 -19.632   <2e-16 ***
## studentYes  -5.844e-01  2.684e-01  -2.177   0.0295 *  
## balance      5.789e-03  2.617e-04  22.119   <2e-16 ***
## income       3.629e-06  9.375e-06   0.387   0.6987    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2340.6  on 8000  degrees of freedom
## Residual deviance: 1244.3  on 7997  degrees of freedom
## AIC: 1252.3
## 
## Number of Fisher Scoring iterations: 8