credit-card-data-part-2-incomplete.R

### Utilizing  Regression Model to Predict Credit card Defaults###

##The data set resides in the ISLR package of R. it contains selected variables and data for 10 000 credit card users##

#packages
pacman::p_load(pacman,dplyr,GGally,ggvis,ggthemes,ggplot2,
               rio,lubridate,shiny,tidyr,plotly,psych,rmarkdown,httr,tidyverse,stringr)

  #STUDENT - A binary factor containing whether or not a given credit card holder is a student
  #INCOME -The gross annual income for a given credit card holder
  #BALANCE - The total credit card balance for a given credit card holder
  #DEFAULT  -A binary factor containing whether or not a given user has defaulted on his/her credit card

##The objective of our investigation is to fit a model such that the relevant predictors of credit card default are explicitly shown given this variables##

#Income, Balance , and Default##
# Is there a relationship between income , balance and student status such that one, two or perhaps all of these variables might be useful to pedict credit card default?

library(ISLR)

##assigning the data to a data frame or variable called Credit card data##
Credit_card.data<- as.data.frame(Default)
head(Credit_card.data)

##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559

summary(Credit_card.data)

##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554

#CONVERTING THE NO AND YES TO NUMBERS FOR EASY USE IN GRAPHICS AND ALL##
Credit_card.data$default<-as.numeric(Credit_card.data$default)

##Lets Visualize our data to try and see if there are clear patterns in our data or relationships##
plot(Credit_card.data$balance, Credit_card.data$income ,col="blue",ylab="income", xlab = "balance",
     main = "Balance as a function of :Income",pch=19)

##There seems to be no clear relationship between income and credit card balance
      ##now we look at the trend#
B<-table(Credit_card.data$student)
B

## 
##   No  Yes 
## 7056 2944

barplot(B,col=rainbow(2),ylab = "INCOME",xlab = "STUDENT STATUS",
        main = "BAR GRAPH OF INCOME AND WHETHER STUDENT OR NOT")
box()

# THERE ARE 2944 STUDENTS WITH CREDIT CARDS IN OUR DATA SET
# 7056 ARE NOT STUDENTS


B<-table(Credit_card.data$default)
B

## 
##    1    2 
## 9667  333

barplot(B,col=rainbow(2),ylab = "INCOME",xlab = "DEFAULT STATUS",
        main = "BAR GRAPH OF INCOME AND WHETHER DEFAULT OR NOT")
box()

# THERE WERE A TOTAL OF 333 DEFAULTS AND A STUDENT
B<-table(Credit_card.data$default[Credit_card.data$student=="Yes"])
B

## 
##    1    2 
## 2817  127

barplot(B,col=rainbow(2),ylab = "INCOME",xlab = "DEFAULT STATUS PER STUDENT",
        main = "BAR GRAPH OF INCOME AND WHETHER DEFAULT OR NOT")
box()
# THERE WERE A TOTAL OF 127 DEFAULTS THAT BELONGED TO STUDENTS
(127/333)*100

## [1] 38.13814

#SO AMONGST THE 333 DEFAULTS 38% OF THE DAFAULTS WERE STUDENTS AND 61.86% WERE NOT STUDENTS.

## Fit a line(regression and lowess)
(1-127/333)*100

## [1] 61.86186

abline(lm(Credit_card.data$income~Credit_card.data$balance),col="red") #linear trend line
lines(lowess(Credit_card.data$balance,Credit_card.data$income),col="magenta") #fitted trend line

###GGPLOT##



##revert plot points shape to that of the original plot default#
ggplot(Credit_card.data,
       aes(x=balance,
           y=income))+geom_point(shape=1)

##add regression line (linear trend)##
ggplot(Credit_card.data,
       aes(x=balance,
           y=income))+
  geom_point(shape=1)+
  geom_smooth(method = lm,
              col="blue",
              se= FALSE)##including se takes out the inter confidence#

## `geom_smooth()` using formula 'y ~ x'

##next we add a confidence interval##
ggplot(Credit_card.data,
       aes(x=balance,
           y=income))+
  geom_point(shape=1)+
  geom_smooth(method = lm,
              col="blue")

## `geom_smooth()` using formula 'y ~ x'

##add a curved trend line with confidence intervals
ggplot(Credit_card.data,
       aes(x=balance,
           y=income))+
  geom_point(col="red")+
  geom_smooth(col="yellow")+
  labs(x="Balance",y="Income",title="Income vs Balance")

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

par(mfrow=c(2,1))
## Differentiate by default status##
ggplot(Credit_card.data,
       aes(x=balance,
           y= income,col=default,fill=student))+
  geom_point()+
  geom_smooth()+
  labs(x="Balance in Dollars",
       y="Income in Dollars",
       title="Balance on Income and Default Status")

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

##scatter plot seems to suggest that there's a relationship between default status and balance while income is not related##
##we seem to have more defaults from around the balance of 1000 dollars and increases with credit card balance
##The defaults rate is spread evenly above and below our fitted trend line
##most of the defaults were from student accounts, it seems they tend to have
## a credit card balance of around $1000 and above while their income is low


## Differentiate by student status##
ggplot(Credit_card.data,
       aes(x=balance,
           y=income,col=student))+
  geom_point()+
  geom_smooth(col="blue")+
  labs(x="Balance in Dollars",
       y="Income in Dollars",
       title="Balance on Income per student status")

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

##The scatter plot between income and balance shows us that the students incomes is less than $40 000 dollars
##And they make up most of the defaults since most of the student balance is located to the right of $1000 putting them in the high risk zone of default
##another fact is that its evident that the students income lie below the trend line


boxplot(Credit_card.data$income,xlab="INCOME",
        main="Credit Card:Income",col="purple", horizontal = T)
boxplot(Credit_card.data$balance,col="blue",
        horizontal = T,xlab="Balance in Dollars",
        main="Boxplot of Balance")

min(Credit_card.data$income)

## [1] 771.9677

max(Credit_card.data$income)

## [1] 73554.23

min(Credit_card.data$balance)

## [1] 0

max(Credit_card.data$balance)

## [1] 2654.323

#The minimum income in our study is $771.97 with the max being $73 554.23
#And balance goes as follows, min = $ 0 with the max being at $2654 
summary(Credit_card.data)

##     default      student       balance           income     
##  Min.   :1.000   No :7056   Min.   :   0.0   Min.   :  772  
##  1st Qu.:1.000   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##  Median :1.000              Median : 823.6   Median :34553  
##  Mean   :1.033              Mean   : 835.4   Mean   :33517  
##  3rd Qu.:1.000              3rd Qu.:1166.3   3rd Qu.:43808  
##  Max.   :2.000              Max.   :2654.3   Max.   :73554

##There are 2944 students in our data set from a population of 10 000 with only
#333 defaults on credit cards and we saw in the scatter plot that most students fall in the default zone at balance of above $1000


##MODEL#

#The plots suggest that credit card balance, but not income, is a useful predictor of default status
#However to be thorough in our investigations or research we will begin by fitting all parameters to a model of logistic form
#we chose this particular model to fit this credit card data because:
    #the model does well if the number of parameters is low
    #highly interpretable
    #quick operating time in R
    #fits the binary nature of our problem well


###Regression##

#modeling,analysis of variance, descriptives and inferences

#model1
Md1 <- lm(default~balance+student+income ,data=Credit_card.data)
Md1

## 
## Call:
## lm(formula = default ~ balance + student + income, data = Credit_card.data)
## 
## Coefficients:
## (Intercept)      balance   studentYes       income  
##   9.188e-01    1.327e-04   -1.033e-02    1.992e-07

summary(Md1)

## 
## Call:
## lm(formula = default ~ balance + student + income, data = Credit_card.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24610 -0.06979 -0.02645  0.02018  0.98542 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.188e-01  8.382e-03 109.617   <2e-16 ***
## balance      1.327e-04  3.547e-06  37.412   <2e-16 ***
## studentYes  -1.033e-02  5.663e-03  -1.824   0.0682 .  
## income       1.992e-07  1.917e-07   1.039   0.2990    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.168 on 9996 degrees of freedom
## Multiple R-squared:  0.124,  Adjusted R-squared:  0.1238 
## F-statistic: 471.7 on 3 and 9996 DF,  p-value: < 2.2e-16

confint(Md1)

##                     2.5 %       97.5 %
## (Intercept)  9.023900e-01 9.352511e-01
## balance      1.257375e-04 1.396420e-04
## studentYes  -2.143107e-02 7.708687e-04
## income      -1.766720e-07 5.749776e-07

anova(Md1)

## Analysis of Variance Table
## 
## Response: default
##             Df  Sum Sq Mean Sq  F value    Pr(>F)    
## balance      1  39.461  39.461 1398.826 < 2.2e-16 ***
## student      1   0.432   0.432   15.305 9.207e-05 ***
## income       1   0.030   0.030    1.079     0.299    
## Residuals 9996 281.988   0.028                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

##MD1's Adjusted R^2 is = 0.1238
##and the summary of MD! tells us that balance is indeed a useful predictor of default status and as well as whether or not the account holdr is a student or not

#model2 (income left out)
Md2 <- lm(default~balance+student,data = Credit_card.data)
Md2

## 
## Call:
## lm(formula = default ~ balance + student, data = Credit_card.data)
## 
## Coefficients:
## (Intercept)      balance   studentYes  
##   0.9267835    0.0001327   -0.0147253

summary(Md2)

## 
## Call:
## lm(formula = default ~ balance + student, data = Credit_card.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24406 -0.06966 -0.02638  0.02023  0.98665 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.268e-01  3.390e-03 273.405  < 2e-16 ***
## balance      1.327e-04  3.547e-06  37.414  < 2e-16 ***
## studentYes  -1.473e-02  3.764e-03  -3.912 9.21e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.168 on 9997 degrees of freedom
## Multiple R-squared:  0.1239, Adjusted R-squared:  0.1237 
## F-statistic: 707.1 on 2 and 9997 DF,  p-value: < 2.2e-16

confint(Md2)

##                     2.5 %        97.5 %
## (Intercept)  0.9201388164  0.9334281294
## balance      0.0001257446  0.0001396492
## studentYes  -0.0221034535 -0.0073471508

anova(Md2)

## Analysis of Variance Table
## 
## Response: default
##             Df  Sum Sq Mean Sq  F value    Pr(>F)    
## balance      1  39.461  39.461 1398.815 < 2.2e-16 ***
## student      1   0.432   0.432   15.305 9.208e-05 ***
## Residuals 9997 282.018   0.028                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#MD2's Adjusted R^2 is the same as that of MD1 being 0.1237 but with fewer variable to predict the data
##this further suggest the lack of relationship between income and default status

#model3 ( student status left out)
Md3 <- lm(default~balance, data = Credit_card.data)
Md3

## 
## Call:
## lm(formula = default ~ balance, data = Credit_card.data)
## 
## Coefficients:
## (Intercept)      balance  
##   0.9248080    0.0001299

summary(Md3)

## 
## Call:
## lm(formula = default ~ balance, data = Credit_card.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23533 -0.06939 -0.02628  0.02004  0.99046 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.248e-01  3.354e-03  275.70   <2e-16 ***
## balance     1.299e-04  3.475e-06   37.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1681 on 9998 degrees of freedom
## Multiple R-squared:  0.1226, Adjusted R-squared:  0.1225 
## F-statistic:  1397 on 1 and 9998 DF,  p-value: < 2.2e-16

confint(Md3)

##                    2.5 %       97.5 %
## (Intercept) 0.9182328199 0.9313832624
## balance     0.0001230606 0.0001366838

anova(Md3)

## Analysis of Variance Table
## 
## Response: default
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## balance      1  39.461  39.461  1396.8 < 2.2e-16 ***
## Residuals 9998 282.450   0.028                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#MD3's Adjusted R^2 is 0.1225

cor(Credit_card.data$default,Credit_card.data$balance)

## [1] 0.3501192

#model4 (balance left out)
Md4 <- lm(default~student, data = Credit_card.data)
Md4

## 
## Call:
## lm(formula = default ~ student, data = Credit_card.data)
## 
## Coefficients:
## (Intercept)   studentYes  
##     1.02920      0.01394

summary(Md4)

## 
## Call:
## lm(formula = default ~ student, data = Credit_card.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.04314 -0.04314 -0.02920 -0.02920  0.97080 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.029195   0.002135 482.101  < 2e-16 ***
## studentYes  0.013944   0.003935   3.544 0.000396 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1793 on 9998 degrees of freedom
## Multiple R-squared:  0.001255,   Adjusted R-squared:  0.001155 
## F-statistic: 12.56 on 1 and 9998 DF,  p-value: 0.000396

confint(Md4)

##                   2.5 %     97.5 %
## (Intercept) 1.025010351 1.03337967
## studentYes  0.006231145 0.02165601

anova(Md4)

## Analysis of Variance Table
## 
## Response: default
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## student      1   0.40 0.40387  12.559 0.000396 ***
## Residuals 9998 321.51 0.03216                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cor(Credit_card.data$default,Credit_card.data$income)

## [1] -0.01987145

## it is clear that if we remove balance as a factor the adjusted R^2 reduces and the predictive power of our model significantly drops
##this goes to further solidify the usefulness of Credit card balance as a predictor of credit card defaults status

##it should be noted that student status increases the probability of default
Md1$coefficients

##   (Intercept)       balance    studentYes        income 
##  9.188205e-01  1.326897e-04 -1.033010e-02  1.991528e-07

##predicting whether a credit card user will default or not closer to 1= no closer to 2= yes
#balance = $1500 ,student= yes or =2 ,income= $25 000
9.291506e-01 + 1.326897e-04*(1500)-1.033010e-02*(2)+1.991528e-07*(25000)

## [1] 1.112504

#the mean default is 1.033
# the response is 1.11 which so we can say that since the response is closer t the mean and to 1 the candidate will not is more likely to not default

#2. balance = $7000 ,student= yes or =2 ,income= $50 and a student 
9.291506e-01 + 1.326897e-04*(7000)-1.033010e-02*(2)+1.991528e-07*(50)

## [1] 1.837328

## the model suggest that balance plays a crucial role in whether or not an individual defautls or not
## as we increase the credit card balance but reduce the income we see that the response gets closer to 2 and as the balance gradually grows the response grows to
##thus credit card balance plays a major role in whether or not the client defaults or not and inversely
#proportional to credit card income.
par(mfrow=c(1,1))


#model training and decision trees

library(rpart)
library(rattle)

## Loading required package: bitops

## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(rpart.plot)
library(RColorBrewer)
library(crossval)
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(vcd)

## Loading required package: grid

## 
## Attaching package: 'vcd'

## The following object is masked from 'package:ISLR':
## 
##     Hitters

library(Metrics)

D1 <-Credit_card.data
d1.ori<-D1
set.seed(99)

tr <- d1.ori[sample(row.names(d1.ori), size = round(nrow(d1.ori)*0.5)), ]
te <- d1.ori[!(row.names(d1.ori) %in% row.names(tr)), ]

#reset the original training and test data
tr1 <- tr
te1  <- te
te2 <-te

#zero r strategy no one will default
te2$default <- rep(0,nrow(te2))

#building the tree
tr1$default = as.factor(tr1$default)
fit1 <- rpart(formula=default ~ .,
              data=tr1,
              control=rpart.control(minsplit=20, minbucket=1, cp=0.08))
fit1

## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 5000 166 1 (0.96680000 0.03320000)  
##   2) balance< 1846.077 4898  93 1 (0.98101266 0.01898734) *
##   3) balance>=1846.077 102  29 2 (0.28431373 0.71568627) *

gc()

##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2670553 142.7    4551668 243.1  4345035 232.1
## Vcells 6037978  46.1   12255594  93.6 12254840  93.5

fancyRpartPlot(fit1)

printcp(fit1)

## 
## Classification tree:
## rpart(formula = default ~ ., data = tr1, control = rpart.control(minsplit = 20, 
##     minbucket = 1, cp = 0.08))
## 
## Variables actually used in tree construction:
## [1] balance
## 
## Root node error: 166/5000 = 0.0332
## 
## n= 5000 
## 
##        CP nsplit rel error  xerror     xstd
## 1 0.26506      0   1.00000 1.00000 0.076316
## 2 0.08000      1   0.73494 0.88554 0.071957

print(fit1)

## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 5000 166 1 (0.96680000 0.03320000)  
##   2) balance< 1846.077 4898  93 1 (0.98101266 0.01898734) *
##   3) balance>=1846.077 102  29 2 (0.28431373 0.71568627) *

fit1$cptable[which.min(fit1$cptable[,"xerror"]),"CP"]

## [1] 0.08

#compare with base model

Prediction<-predict(fit1,te1,type="class")
#Update the prediction

te2$Purchase <- Prediction

Pred = factor(as.factor(te2$default))
Actual = factor(as.factor(te1$default))
table(te1$default)

## 
##    1    2 
## 4833  167

cm1 = confusionMatrix(Actual,Pred)
cm1

##   FP   TP   TN   FN 
##    0 5000    0    0 
## attr(,"negative")
## [1] "control"

credit-card-data-part-2-incomplete.R

xholi

2021-01-05