### Utilizing Regression Model to Predict Credit card Defaults###
##The data set resides in the ISLR package of R. it contains selected variables and data for 10 000 credit card users##
#packages
pacman::p_load(pacman,dplyr,GGally,ggvis,ggthemes,ggplot2,
rio,lubridate,shiny,tidyr,plotly,psych,rmarkdown,httr,tidyverse,stringr)
#STUDENT - A binary factor containing whether or not a given credit card holder is a student
#INCOME -The gross annual income for a given credit card holder
#BALANCE - The total credit card balance for a given credit card holder
#DEFAULT -A binary factor containing whether or not a given user has defaulted on his/her credit card
##The objective of our investigation is to fit a model such that the relevant predictors of credit card default are explicitly shown given this variables##
#Income, Balance , and Default##
# Is there a relationship between income , balance and student status such that one, two or perhaps all of these variables might be useful to pedict credit card default?
library(ISLR)
##assigning the data to a data frame or variable called Credit card data##
Credit_card.data<- as.data.frame(Default)
head(Credit_card.data)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
summary(Credit_card.data)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
#CONVERTING THE NO AND YES TO NUMBERS FOR EASY USE IN GRAPHICS AND ALL##
Credit_card.data$default<-as.numeric(Credit_card.data$default)
##Lets Visualize our data to try and see if there are clear patterns in our data or relationships##
plot(Credit_card.data$balance, Credit_card.data$income ,col="blue",ylab="income", xlab = "balance",
main = "Balance as a function of :Income",pch=19)

##There seems to be no clear relationship between income and credit card balance
##now we look at the trend#
B<-table(Credit_card.data$student)
B
##
## No Yes
## 7056 2944
barplot(B,col=rainbow(2),ylab = "INCOME",xlab = "STUDENT STATUS",
main = "BAR GRAPH OF INCOME AND WHETHER STUDENT OR NOT")
box()

# THERE ARE 2944 STUDENTS WITH CREDIT CARDS IN OUR DATA SET
# 7056 ARE NOT STUDENTS
B<-table(Credit_card.data$default)
B
##
## 1 2
## 9667 333
barplot(B,col=rainbow(2),ylab = "INCOME",xlab = "DEFAULT STATUS",
main = "BAR GRAPH OF INCOME AND WHETHER DEFAULT OR NOT")
box()

# THERE WERE A TOTAL OF 333 DEFAULTS AND A STUDENT
B<-table(Credit_card.data$default[Credit_card.data$student=="Yes"])
B
##
## 1 2
## 2817 127
barplot(B,col=rainbow(2),ylab = "INCOME",xlab = "DEFAULT STATUS PER STUDENT",
main = "BAR GRAPH OF INCOME AND WHETHER DEFAULT OR NOT")
box()
# THERE WERE A TOTAL OF 127 DEFAULTS THAT BELONGED TO STUDENTS
(127/333)*100
## [1] 38.13814
#SO AMONGST THE 333 DEFAULTS 38% OF THE DAFAULTS WERE STUDENTS AND 61.86% WERE NOT STUDENTS.
## Fit a line(regression and lowess)
(1-127/333)*100
## [1] 61.86186
abline(lm(Credit_card.data$income~Credit_card.data$balance),col="red") #linear trend line
lines(lowess(Credit_card.data$balance,Credit_card.data$income),col="magenta") #fitted trend line

###GGPLOT##
##revert plot points shape to that of the original plot default#
ggplot(Credit_card.data,
aes(x=balance,
y=income))+geom_point(shape=1)

##add regression line (linear trend)##
ggplot(Credit_card.data,
aes(x=balance,
y=income))+
geom_point(shape=1)+
geom_smooth(method = lm,
col="blue",
se= FALSE)##including se takes out the inter confidence#
## `geom_smooth()` using formula 'y ~ x'

##next we add a confidence interval##
ggplot(Credit_card.data,
aes(x=balance,
y=income))+
geom_point(shape=1)+
geom_smooth(method = lm,
col="blue")
## `geom_smooth()` using formula 'y ~ x'

##add a curved trend line with confidence intervals
ggplot(Credit_card.data,
aes(x=balance,
y=income))+
geom_point(col="red")+
geom_smooth(col="yellow")+
labs(x="Balance",y="Income",title="Income vs Balance")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

par(mfrow=c(2,1))
## Differentiate by default status##
ggplot(Credit_card.data,
aes(x=balance,
y= income,col=default,fill=student))+
geom_point()+
geom_smooth()+
labs(x="Balance in Dollars",
y="Income in Dollars",
title="Balance on Income and Default Status")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

##scatter plot seems to suggest that there's a relationship between default status and balance while income is not related##
##we seem to have more defaults from around the balance of 1000 dollars and increases with credit card balance
##The defaults rate is spread evenly above and below our fitted trend line
##most of the defaults were from student accounts, it seems they tend to have
## a credit card balance of around $1000 and above while their income is low
## Differentiate by student status##
ggplot(Credit_card.data,
aes(x=balance,
y=income,col=student))+
geom_point()+
geom_smooth(col="blue")+
labs(x="Balance in Dollars",
y="Income in Dollars",
title="Balance on Income per student status")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

##The scatter plot between income and balance shows us that the students incomes is less than $40 000 dollars
##And they make up most of the defaults since most of the student balance is located to the right of $1000 putting them in the high risk zone of default
##another fact is that its evident that the students income lie below the trend line
boxplot(Credit_card.data$income,xlab="INCOME",
main="Credit Card:Income",col="purple", horizontal = T)
boxplot(Credit_card.data$balance,col="blue",
horizontal = T,xlab="Balance in Dollars",
main="Boxplot of Balance")

min(Credit_card.data$income)
## [1] 771.9677
max(Credit_card.data$income)
## [1] 73554.23
min(Credit_card.data$balance)
## [1] 0
max(Credit_card.data$balance)
## [1] 2654.323
#The minimum income in our study is $771.97 with the max being $73 554.23
#And balance goes as follows, min = $ 0 with the max being at $2654
summary(Credit_card.data)
## default student balance income
## Min. :1.000 No :7056 Min. : 0.0 Min. : 772
## 1st Qu.:1.000 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median :1.000 Median : 823.6 Median :34553
## Mean :1.033 Mean : 835.4 Mean :33517
## 3rd Qu.:1.000 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2.000 Max. :2654.3 Max. :73554
##There are 2944 students in our data set from a population of 10 000 with only
#333 defaults on credit cards and we saw in the scatter plot that most students fall in the default zone at balance of above $1000
##MODEL#
#The plots suggest that credit card balance, but not income, is a useful predictor of default status
#However to be thorough in our investigations or research we will begin by fitting all parameters to a model of logistic form
#we chose this particular model to fit this credit card data because:
#the model does well if the number of parameters is low
#highly interpretable
#quick operating time in R
#fits the binary nature of our problem well
###Regression##
#modeling,analysis of variance, descriptives and inferences
#model1
Md1 <- lm(default~balance+student+income ,data=Credit_card.data)
Md1
##
## Call:
## lm(formula = default ~ balance + student + income, data = Credit_card.data)
##
## Coefficients:
## (Intercept) balance studentYes income
## 9.188e-01 1.327e-04 -1.033e-02 1.992e-07
summary(Md1)
##
## Call:
## lm(formula = default ~ balance + student + income, data = Credit_card.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24610 -0.06979 -0.02645 0.02018 0.98542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.188e-01 8.382e-03 109.617 <2e-16 ***
## balance 1.327e-04 3.547e-06 37.412 <2e-16 ***
## studentYes -1.033e-02 5.663e-03 -1.824 0.0682 .
## income 1.992e-07 1.917e-07 1.039 0.2990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.168 on 9996 degrees of freedom
## Multiple R-squared: 0.124, Adjusted R-squared: 0.1238
## F-statistic: 471.7 on 3 and 9996 DF, p-value: < 2.2e-16
confint(Md1)
## 2.5 % 97.5 %
## (Intercept) 9.023900e-01 9.352511e-01
## balance 1.257375e-04 1.396420e-04
## studentYes -2.143107e-02 7.708687e-04
## income -1.766720e-07 5.749776e-07
anova(Md1)
## Analysis of Variance Table
##
## Response: default
## Df Sum Sq Mean Sq F value Pr(>F)
## balance 1 39.461 39.461 1398.826 < 2.2e-16 ***
## student 1 0.432 0.432 15.305 9.207e-05 ***
## income 1 0.030 0.030 1.079 0.299
## Residuals 9996 281.988 0.028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##MD1's Adjusted R^2 is = 0.1238
##and the summary of MD! tells us that balance is indeed a useful predictor of default status and as well as whether or not the account holdr is a student or not
#model2 (income left out)
Md2 <- lm(default~balance+student,data = Credit_card.data)
Md2
##
## Call:
## lm(formula = default ~ balance + student, data = Credit_card.data)
##
## Coefficients:
## (Intercept) balance studentYes
## 0.9267835 0.0001327 -0.0147253
summary(Md2)
##
## Call:
## lm(formula = default ~ balance + student, data = Credit_card.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.24406 -0.06966 -0.02638 0.02023 0.98665
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.268e-01 3.390e-03 273.405 < 2e-16 ***
## balance 1.327e-04 3.547e-06 37.414 < 2e-16 ***
## studentYes -1.473e-02 3.764e-03 -3.912 9.21e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.168 on 9997 degrees of freedom
## Multiple R-squared: 0.1239, Adjusted R-squared: 0.1237
## F-statistic: 707.1 on 2 and 9997 DF, p-value: < 2.2e-16
confint(Md2)
## 2.5 % 97.5 %
## (Intercept) 0.9201388164 0.9334281294
## balance 0.0001257446 0.0001396492
## studentYes -0.0221034535 -0.0073471508
anova(Md2)
## Analysis of Variance Table
##
## Response: default
## Df Sum Sq Mean Sq F value Pr(>F)
## balance 1 39.461 39.461 1398.815 < 2.2e-16 ***
## student 1 0.432 0.432 15.305 9.208e-05 ***
## Residuals 9997 282.018 0.028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#MD2's Adjusted R^2 is the same as that of MD1 being 0.1237 but with fewer variable to predict the data
##this further suggest the lack of relationship between income and default status
#model3 ( student status left out)
Md3 <- lm(default~balance, data = Credit_card.data)
Md3
##
## Call:
## lm(formula = default ~ balance, data = Credit_card.data)
##
## Coefficients:
## (Intercept) balance
## 0.9248080 0.0001299
summary(Md3)
##
## Call:
## lm(formula = default ~ balance, data = Credit_card.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.23533 -0.06939 -0.02628 0.02004 0.99046
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.248e-01 3.354e-03 275.70 <2e-16 ***
## balance 1.299e-04 3.475e-06 37.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1681 on 9998 degrees of freedom
## Multiple R-squared: 0.1226, Adjusted R-squared: 0.1225
## F-statistic: 1397 on 1 and 9998 DF, p-value: < 2.2e-16
confint(Md3)
## 2.5 % 97.5 %
## (Intercept) 0.9182328199 0.9313832624
## balance 0.0001230606 0.0001366838
anova(Md3)
## Analysis of Variance Table
##
## Response: default
## Df Sum Sq Mean Sq F value Pr(>F)
## balance 1 39.461 39.461 1396.8 < 2.2e-16 ***
## Residuals 9998 282.450 0.028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#MD3's Adjusted R^2 is 0.1225
cor(Credit_card.data$default,Credit_card.data$balance)
## [1] 0.3501192
#model4 (balance left out)
Md4 <- lm(default~student, data = Credit_card.data)
Md4
##
## Call:
## lm(formula = default ~ student, data = Credit_card.data)
##
## Coefficients:
## (Intercept) studentYes
## 1.02920 0.01394
summary(Md4)
##
## Call:
## lm(formula = default ~ student, data = Credit_card.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.04314 -0.04314 -0.02920 -0.02920 0.97080
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.029195 0.002135 482.101 < 2e-16 ***
## studentYes 0.013944 0.003935 3.544 0.000396 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1793 on 9998 degrees of freedom
## Multiple R-squared: 0.001255, Adjusted R-squared: 0.001155
## F-statistic: 12.56 on 1 and 9998 DF, p-value: 0.000396
confint(Md4)
## 2.5 % 97.5 %
## (Intercept) 1.025010351 1.03337967
## studentYes 0.006231145 0.02165601
anova(Md4)
## Analysis of Variance Table
##
## Response: default
## Df Sum Sq Mean Sq F value Pr(>F)
## student 1 0.40 0.40387 12.559 0.000396 ***
## Residuals 9998 321.51 0.03216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cor(Credit_card.data$default,Credit_card.data$income)
## [1] -0.01987145
## it is clear that if we remove balance as a factor the adjusted R^2 reduces and the predictive power of our model significantly drops
##this goes to further solidify the usefulness of Credit card balance as a predictor of credit card defaults status
##it should be noted that student status increases the probability of default
Md1$coefficients
## (Intercept) balance studentYes income
## 9.188205e-01 1.326897e-04 -1.033010e-02 1.991528e-07
##predicting whether a credit card user will default or not closer to 1= no closer to 2= yes
#balance = $1500 ,student= yes or =2 ,income= $25 000
9.291506e-01 + 1.326897e-04*(1500)-1.033010e-02*(2)+1.991528e-07*(25000)
## [1] 1.112504
#the mean default is 1.033
# the response is 1.11 which so we can say that since the response is closer t the mean and to 1 the candidate will not is more likely to not default
#2. balance = $7000 ,student= yes or =2 ,income= $50 and a student
9.291506e-01 + 1.326897e-04*(7000)-1.033010e-02*(2)+1.991528e-07*(50)
## [1] 1.837328
## the model suggest that balance plays a crucial role in whether or not an individual defautls or not
## as we increase the credit card balance but reduce the income we see that the response gets closer to 2 and as the balance gradually grows the response grows to
##thus credit card balance plays a major role in whether or not the client defaults or not and inversely
#proportional to credit card income.
par(mfrow=c(1,1))
#model training and decision trees
library(rpart)
library(rattle)
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(RColorBrewer)
library(crossval)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(vcd)
## Loading required package: grid
##
## Attaching package: 'vcd'
## The following object is masked from 'package:ISLR':
##
## Hitters
library(Metrics)
D1 <-Credit_card.data
d1.ori<-D1
set.seed(99)
tr <- d1.ori[sample(row.names(d1.ori), size = round(nrow(d1.ori)*0.5)), ]
te <- d1.ori[!(row.names(d1.ori) %in% row.names(tr)), ]
#reset the original training and test data
tr1 <- tr
te1 <- te
te2 <-te
#zero r strategy no one will default
te2$default <- rep(0,nrow(te2))
#building the tree
tr1$default = as.factor(tr1$default)
fit1 <- rpart(formula=default ~ .,
data=tr1,
control=rpart.control(minsplit=20, minbucket=1, cp=0.08))
fit1
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 166 1 (0.96680000 0.03320000)
## 2) balance< 1846.077 4898 93 1 (0.98101266 0.01898734) *
## 3) balance>=1846.077 102 29 2 (0.28431373 0.71568627) *
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2670553 142.7 4551668 243.1 4345035 232.1
## Vcells 6037978 46.1 12255594 93.6 12254840 93.5
fancyRpartPlot(fit1)

printcp(fit1)
##
## Classification tree:
## rpart(formula = default ~ ., data = tr1, control = rpart.control(minsplit = 20,
## minbucket = 1, cp = 0.08))
##
## Variables actually used in tree construction:
## [1] balance
##
## Root node error: 166/5000 = 0.0332
##
## n= 5000
##
## CP nsplit rel error xerror xstd
## 1 0.26506 0 1.00000 1.00000 0.076316
## 2 0.08000 1 0.73494 0.88554 0.071957
print(fit1)
## n= 5000
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 5000 166 1 (0.96680000 0.03320000)
## 2) balance< 1846.077 4898 93 1 (0.98101266 0.01898734) *
## 3) balance>=1846.077 102 29 2 (0.28431373 0.71568627) *
fit1$cptable[which.min(fit1$cptable[,"xerror"]),"CP"]
## [1] 0.08
#compare with base model
Prediction<-predict(fit1,te1,type="class")
#Update the prediction
te2$Purchase <- Prediction
Pred = factor(as.factor(te2$default))
Actual = factor(as.factor(te1$default))
table(te1$default)
##
## 1 2
## 4833 167
cm1 = confusionMatrix(Actual,Pred)
cm1
## FP TP TN FN
## 0 5000 0 0
## attr(,"negative")
## [1] "control"