Replicate - Default and Smarket analysis

# Load library ISLR2
library(ISLR2)
#installing and loading library tidyverse for computation and visualization
#install.packages("tidyverse")
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Load the Default data set into the workspace
# (presumably the one shipped with ISLR2, attached above - confirm).
data("Default")
#Default%>%head()%>%View()
# Attach data.table; as.data.table() is used further down on the test set.
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

EDA to understand patterns of default

# Share of observations in each default category, in percent.
# This runs before the recoding below, so `default` still holds its
# original "No"/"Yes" labels in percentage_data.
percentage_data <- summarise(
  group_by(Default, default),
  percentage = n() / nrow(Default) * 100
)
library(dplyr)

# Recode both Yes/No columns as numeric indicators (Yes: 1, No: 0).
Default <- mutate(
  Default,
  student = ifelse(student == "Yes", 1, 0),
  default = ifelse(default == "Yes", 1, 0)
)

# Horizontal bar chart of the share of each default status, with the
# rounded percentage printed on every bar.
ggplot(percentage_data, aes(x = default, y = percentage, fill = default)) +
  geom_col(position = position_dodge(width = 0.1), alpha = 0.7) +
  geom_text(
    aes(label = paste0(round(percentage), "%")),
    position = position_dodge(width = 0.2), vjust = -0.5, size = 4
  ) +
  # percentage is already on a 0-100 scale, hence scale = 1.
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  scale_fill_manual(values = c("No" = "red", "Yes" = "green")) +
  labs(title = "Distribution of Default status", x = "Default", y = "Percentage") +
  theme_minimal() +
  # The axis already names each bar, so the fill legend is switched off.
  # (A guides() call configuring that same legend was dropped: it had no
  # effect once the legend is hidden.)
  theme(legend.position = "none") +
  coord_flip()

### EDA - crosstabulation

# Cross-tabulate student status against default status.
# Counts and overall percentages are computed on the 0/1 coding; the codes
# are then relabelled as "No"/"Yes" and the table is spread so that every
# student status gets its own count and percent column.
cross_tab <- Default %>%
  count(student, default) %>%
  mutate(
    percent = n / sum(n) * 100,
    default = ifelse(default == 1, "Yes", "No"),
    student = ifelse(student == 1, "Yes", "No")
  ) %>%
  pivot_wider(
    names_from = student,
    values_from = c(n, percent),
    names_prefix = "student_"
  )

cross_tab

## # A tibble: 2 × 5
##   default n_student_No n_student_Yes percent_student_No percent_student_Yes
##   <chr>          <int>         <int>              <dbl>               <dbl>
## 1 No              6850          2817              68.5                28.2 
## 2 Yes              206           127               2.06                1.27

# Boxplots of income against default and of balance against default.
# These help show whether either variable has an immediate relationship
# with default status.
#
# `default` is relabelled once as a No/Yes factor and that single column
# drives both the x axis and the fill, so the axis and the legend agree
# (previously the legend showed the raw 0/1 coding under the title
# "factor(default)").

Default %>%
  mutate(default_label = factor(default, labels = c("No", "Yes"))) %>%
  ggplot(aes(x = default_label, y = income, fill = default_label)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Default status on Income",
    x = "Default",
    fill = "Default"
  ) +
  theme(
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

Default %>%
  mutate(default_label = factor(default, labels = c("No", "Yes"))) %>%
  ggplot(aes(x = default_label, y = balance, fill = default_label)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Default status on Balance",
    x = "Default",
    fill = "Default"
  ) +
  theme(
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

Modelling

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

# Fix the RNG seed so the random train/test split, and with it the fitted
# coefficients and predictions, are the same on every run.
set.seed(2024)

# Hold out 20% of the rows for testing.
# `default` is numeric 0/1 at this point. For a numeric `y`,
# createDataPartition() bins by percentiles instead of by class; passing a
# factor makes it sample within each class, so the share of defaulters is
# preserved in both the training and the test set.
index <- createDataPartition(
  y = factor(Default$default),
  p = 0.8,
  list = FALSE
)
train_data_def <- Default[index, ]
test_data_def <- Default[-index, ]

# Fit a logistic regression of default on all other columns.
model <- glm(
  default ~ .,
  data = train_data_def,
  family = binomial(link = logit)
)

# Score the held-out rows on three scales:
#   response - predicted probability of default
#   link     - linear predictor
#   terms    - matrix with one column per model term
test_data_def$predicted_default_status <-
  predict(model, newdata = test_data_def, type = "response")
test_data_def$predicted_link <-
  predict(model, newdata = test_data_def, type = "link")
test_data_def$predicted_terms <-
  predict(model, newdata = test_data_def, type = "terms")

# Show the probabilities in fixed rather than scientific notation.
# NOTE(review): format() returns character strings, so this column is no
# longer numeric afterwards; convert it back before doing arithmetic or
# thresholding on it.
test_data_def <- test_data_def %>%
  mutate(
    predicted_default_status = format(predicted_default_status, scientific = FALSE)
  )

summary(model)

## 
## Call:
## glm(formula = default ~ ., family = binomial(link = logit), data = train_data_def)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.098e+01  5.588e-01 -19.648  < 2e-16 ***
## student     -7.548e-01  2.626e-01  -2.875  0.00405 ** 
## balance      5.841e-03  2.647e-04  22.063  < 2e-16 ***
## income       1.463e-06  9.196e-06   0.159  0.87359    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2333.8  on 7999  degrees of freedom
## Residual deviance: 1239.0  on 7996  degrees of freedom
## AIC: 1247
## 
## Number of Fisher Scoring iterations: 8

# Open the scored test set in the data viewer.
test_data_def %>% view()

# Keep a data.table version of the scored test set.
test_data_def_dt <- test_data_def %>% as.data.table()

# Print it as a tibble.
test_data_def_dt %>% tibble()

## # A tibble: 2,000 × 9
##    default student balance income predicted_default_status predicted_link
##      <dbl>   <dbl>   <dbl>  <dbl> <chr>                             <dbl>
##  1       0       0   1074. 31767. 0.009356201358                    -4.66
##  2       0       0    786. 38463. 0.001771688204                    -6.33
##  3       0       1    809. 17600. 0.000924878903                    -6.98
##  4       0       0      0  29275. 0.000017798796                   -10.9 
##  5       0       1   1221. 13269. 0.010097341149                    -4.59
##  6       0       0    914. 46907. 0.003779140874                    -5.57
##  7       0       1   1500. 13191. 0.049498916557                    -2.96
##  8       0       0    653. 39490. 0.000818944826                    -7.11
##  9       0       0    837. 51472. 0.002439325702                    -6.01
## 10       0       0    409. 54207. 0.000200937046                    -8.51
## # ℹ 1,990 more rows
## # ℹ 3 more variables: predicted_terms.student <dbl>,
## #   predicted_terms.balance <dbl>, predicted_terms.income <dbl>

Replicate - Default and Smarket analysis

Hassan N.

2024-02-21

EDA to understand patterns of default

Modelling