# Load the ISLR2 package
library(ISLR2)
# Load tidyverse for data manipulation and visualization (install it first if needed)
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Default data set into the global environment.
# The package is named explicitly so the data set always comes from ISLR2,
# even if another attached package ships a data set with the same name.
data("Default", package = "ISLR2")
# Default %>% head() %>% View()  # uncomment for a quick interactive look
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
# Share of rows falling in each `default` category, expressed as a percentage
# of all rows in Default (one output row per category).
percentage_data <- summarise(
  group_by(Default, default),
  percentage = n() / nrow(Default) * 100
)
library(dplyr)
# Recode the `student` and `default` columns as numeric indicators
# (Yes: 1, No: 0); both columns are overwritten in place.
Default <- Default %>%
  mutate(
    student = as.numeric(student == "Yes"),
    default = as.numeric(default == "Yes")
  )
# Horizontal bar chart of the percentage of observations in each default
# category, with the rounded percentage printed at the end of each bar.
# `percentage_data` must have been computed while `default` was still the
# Yes/No factor, because the fill colours are keyed on "No" / "Yes".
ggplot(percentage_data, aes(x = default, y = percentage, fill = default)) +
  geom_col(alpha = 0.7) +
  # The bars are horizontal after coord_flip(), so the label is pushed past the
  # bar end with hjust (vjust would only move it up and leave it on the bar).
  geom_text(
    aes(label = paste0(round(percentage), "%")),
    hjust = -0.2,
    size = 4
  ) +
  # Extra room at the upper end of the axis so the longest bar's label is not
  # clipped.
  scale_y_continuous(
    labels = scales::percent_format(scale = 1),
    expand = expansion(mult = c(0.05, 0.15))
  ) +
  scale_fill_manual(values = c("No" = "red", "Yes" = "green")) +
  labs(title = "Distribution of Default status", x = "Default", y = "Percentage") +
  coord_flip() +
  theme_minimal() +
  # The fill legend only repeats the axis labels, so it is hidden.
  theme(legend.position = "none")
### EDA - crosstabulation
# Cross-tabulate student status against default status:
#   1. count every (student, default) combination,
#   2. express each count as a percentage of all rows,
#   3. relabel the 1/0 codes as "Yes"/"No",
#   4. spread student status across columns (counts and percentages).
cross_tab <- Default %>%
  count(student, default) %>%
  mutate(
    percent = n / sum(n) * 100,
    default = ifelse(default == 1, "Yes", "No"),
    student = ifelse(student == 1, "Yes", "No")
  ) %>%
  pivot_wider(
    names_from = student,
    values_from = c(n, percent),
    names_prefix = "student_"
  )
cross_tab
## # A tibble: 2 × 5
## default n_student_No n_student_Yes percent_student_No percent_student_Yes
## <chr> <int> <int> <dbl> <dbl>
## 1 No 6850 2817 68.5 28.2
## 2 Yes 206 127 2.06 1.27
# Boxplot of income by default status, to see whether there is an immediate
# relationship between the two.
# `default` is the 0/1 indicator here; it is converted to a No/Yes factor with
# explicit levels so the axis and the legend carry the same labels.
ggplot(
  Default,
  aes(
    x = factor(default, levels = c(0, 1), labels = c("No", "Yes")),
    y = income,
    fill = factor(default, levels = c(0, 1), labels = c("No", "Yes"))
  )
) +
  geom_boxplot() +
  labs(
    title = "Distribution of Default status on Income",
    x = "Default",
    fill = "Default"
  ) +
  theme(
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
# Boxplot of balance by default status.
# `default` is the 0/1 indicator here; it is converted to a No/Yes factor with
# explicit levels so the axis and the legend carry the same labels.
ggplot(
  Default,
  aes(
    x = factor(default, levels = c(0, 1), labels = c("No", "Yes")),
    y = balance,
    fill = factor(default, levels = c(0, 1), labels = c("No", "Yes"))
  )
) +
  geom_boxplot() +
  labs(
    title = "Distribution of Default status on Balance",
    x = "Default",
    fill = "Default"
  ) +
  theme(
    panel.background = element_rect(fill = "white"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Reproducible 80/20 train/test split, stratified on the default class.
# - set.seed() fixes the random draw so the split, and everything fitted on
#   it, can be reproduced.
# - `default` is a numeric 0/1 column at this point, and createDataPartition()
#   bins numeric outcomes by quantiles rather than treating them as classes,
#   so the outcome is passed as a factor to stratify on default / non-default.
# NOTE: console output pasted elsewhere in this file came from an earlier,
# unseeded run; re-run the script to refresh it.
set.seed(123)
index <- createDataPartition(
  y = factor(Default$default),
  p = 0.8,
  list = FALSE
)
train_data_def <- Default[index, ]
test_data_def <- Default[-index, ]
# Fit a logistic regression of `default` on every other column of the
# training split.
model <- glm(
  default ~ .,
  family = binomial(link = logit),
  data = train_data_def
)

# Score the test split with the three prediction types offered by predict():
#   "response" - predictions on the scale of the response
#   "link"     - predictions on the scale of the linear predictor
#   "terms"    - per-term contributions, stored as a multi-column value
test_data_def$predicted_default_status <- predict(
  model,
  newdata = test_data_def,
  type = "response"
)
test_data_def$predicted_link <- predict(
  model,
  newdata = test_data_def,
  type = "link"
)
test_data_def$predicted_terms <- predict(
  model,
  newdata = test_data_def,
  type = "terms"
)
# Keep `predicted_default_status` numeric so it can still be thresholded,
# sorted and summarised. A fixed-notation (non-scientific) character copy is
# added alongside it purely for display.
# NOTE: console output pasted elsewhere in this file shows the earlier layout,
# where the probability column itself was converted to character.
test_data_def <- test_data_def %>%
  mutate(
    predicted_default_status_fmt = format(
      predicted_default_status,
      scientific = FALSE
    )
  )
# Coefficient table and fit statistics for the logistic regression.
summary(model)
##
## Call:
## glm(formula = default ~ ., family = binomial(link = logit), data = train_data_def)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.098e+01 5.588e-01 -19.648 < 2e-16 ***
## student -7.548e-01 2.626e-01 -2.875 0.00405 **
## balance 5.841e-03 2.647e-04 22.063 < 2e-16 ***
## income 1.463e-06 9.196e-06 0.159 0.87359
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2333.8 on 7999 degrees of freedom
## Residual deviance: 1239.0 on 7996 degrees of freedom
## AIC: 1247
##
## Number of Fisher Scoring iterations: 8
# Open the scored test data in the data viewer
test_data_def%>%view()
# Keep a data.table copy of the scored test data
test_data_def_dt <- as.data.table(test_data_def)
# Print the data.table copy as a tibble (the data.table itself is not printed).
# NOTE(review): tibble() is used here as a coercion; as_tibble() may state the
# intent more clearly - confirm before changing.
tibble(test_data_def_dt)
## # A tibble: 2,000 × 9
## default student balance income predicted_default_status predicted_link
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 0 0 1074. 31767. 0.009356201358 -4.66
## 2 0 0 786. 38463. 0.001771688204 -6.33
## 3 0 1 809. 17600. 0.000924878903 -6.98
## 4 0 0 0 29275. 0.000017798796 -10.9
## 5 0 1 1221. 13269. 0.010097341149 -4.59
## 6 0 0 914. 46907. 0.003779140874 -5.57
## 7 0 1 1500. 13191. 0.049498916557 -2.96
## 8 0 0 653. 39490. 0.000818944826 -7.11
## 9 0 0 837. 51472. 0.002439325702 -6.01
## 10 0 0 409. 54207. 0.000200937046 -8.51
## # ℹ 1,990 more rows
## # ℹ 3 more variables: predicted_terms.student <dbl>,
## # predicted_terms.balance <dbl>, predicted_terms.income <dbl>