# credit-card-data-part-2-incomplete.R

### Utilizing  Regression Model to Predict Credit card Defaults###

##The data set resides in the ISLR package of R. it contains selected variables and data for 10 000 credit card users##

#packages
pacman::p_load(pacman,dplyr,GGally,ggvis,ggthemes,ggplot2,
               rio,lubridate,shiny,tidyr,plotly,psych,rmarkdown,httr,tidyverse,stringr)

  #STUDENT - A binary factor containing whether or not a given credit card holder is a student
  #INCOME -The gross annual income for a given credit card holder
  #BALANCE - The total credit card balance for a given credit card holder
  #DEFAULT  -A binary factor containing whether or not a given user has defaulted on his/her credit card

##The objective of our investigation is to fit a model such that the relevant predictors of credit card default are explicitly shown given these variables##

#Income, Balance , and Default##
# Is there a relationship between income, balance and student status such that one, two or perhaps all of these variables might be useful to predict credit card default?

library(ISLR)

## Copy the `Default` data set (from the ISLR package) into a working data frame
## and take a first look at its rows and column summaries.
Credit_card.data <- as.data.frame(Default)
head(Credit_card.data, n = 6L)

##   default student   balance    income
## 1      No      No  729.5265 44361.625
## 2      No     Yes  817.1804 12106.135
## 3      No      No 1073.5492 31767.139
## 4      No      No  529.2506 35704.494
## 5      No      No  785.6559 38463.496
## 6      No     Yes  919.5885  7491.559

summary(object = Credit_card.data)

##  default    student       balance           income     
##  No :9667   No :7056   Min.   :   0.0   Min.   :  772  
##  Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##                        Median : 823.6   Median :34553  
##                        Mean   : 835.4   Mean   :33517  
##                        3rd Qu.:1166.3   3rd Qu.:43808  
##                        Max.   :2654.3   Max.   :73554

## Recode `default` from a No/Yes factor to its numeric level codes
## (No -> 1, Yes -> 2) so it can be used directly in the plots and lm() fits below.
Credit_card.data[["default"]] <- as.numeric(Credit_card.data[["default"]])

## Scatter plot of income (y-axis) against balance (x-axis) to look for any
## visible pattern or relationship between the two variables.
plot(
  x = Credit_card.data$balance,
  y = Credit_card.data$income,
  main = "Balance as a function of :Income",
  xlab = "balance",
  ylab = "income",
  col = "blue",
  pch = 19
)

##There seems to be no clear relationship between income and credit card balance
      ##now we look at the trend#
# Count card holders by student status.
B <- table(Credit_card.data$student)
B

## 
##   No  Yes 
## 7056 2944

# The bar heights are head counts per status, so the y-axis and title describe
# counts; the table contains no income information.
barplot(B,
  col = rainbow(2),
  ylab = "NUMBER OF CARD HOLDERS",
  xlab = "STUDENT STATUS",
  main = "BAR GRAPH OF CARD HOLDERS BY STUDENT STATUS"
)
box()

# THERE ARE 2944 STUDENTS WITH CREDIT CARDS IN OUR DATA SET
# 7056 ARE NOT STUDENTS


# Count card holders by default status (1 = no default, 2 = default after the
# numeric recoding of `default`).
B <- table(Credit_card.data$default)
B

## 
##    1    2 
## 9667  333

# The bar heights are head counts, so the y-axis and title describe counts, and
# the bars are labelled No/Yes instead of the bare codes 1/2.
barplot(B,
  col = rainbow(2),
  names.arg = c("No", "Yes"),
  ylab = "NUMBER OF CARD HOLDERS",
  xlab = "DEFAULT STATUS",
  main = "BAR GRAPH OF CARD HOLDERS BY DEFAULT STATUS"
)
box()

# THERE WERE A TOTAL OF 333 DEFAULTS; HOW MANY OF THEM BELONG TO STUDENTS?
# Default status among student card holders only (1 = no default, 2 = default).
B <- table(Credit_card.data$default[Credit_card.data$student == "Yes"])
B

## 
##    1    2 
## 2817  127

# The bar heights are head counts of students, so the y-axis and title describe
# counts, and the bars are labelled No/Yes instead of the bare codes 1/2.
barplot(B,
  col = rainbow(2),
  names.arg = c("No", "Yes"),
  ylab = "NUMBER OF STUDENTS",
  xlab = "DEFAULT STATUS PER STUDENT",
  main = "BAR GRAPH OF STUDENTS BY DEFAULT STATUS"
)
box()
# THERE WERE A TOTAL OF 127 DEFAULTS THAT BELONGED TO STUDENTS
# Percentage of all defaults held by students, computed from the data instead of
# the hand-typed 127/333 so it stays correct if the data changes.
100 * mean(Credit_card.data$student[Credit_card.data$default == 2] == "Yes")

## [1] 38.13814

#SO AMONGST THE 333 DEFAULTS, 38.14% OF THE DEFAULTS WERE STUDENTS AND 61.86% WERE NOT STUDENTS.

## Percentage of all defaults held by non-students, computed from the data
## instead of the hand-typed 1 - 127/333.
100 * mean(Credit_card.data$student[Credit_card.data$default == 2] == "No")

## [1] 61.86186

## Fit a line (regression and lowess) through the income/balance scatter.
## abline() and lines() draw on the most recent base plot, which at this point
## is a bar plot, so the scatter plot is redrawn first to give the trend lines
## the correct axes.
plot(Credit_card.data$balance, Credit_card.data$income,
  col = "blue", ylab = "income", xlab = "balance",
  main = "Balance as a function of :Income", pch = 19
)
abline(lm(income ~ balance, data = Credit_card.data), col = "red") # linear trend line
lines(lowess(Credit_card.data$balance, Credit_card.data$income), col = "magenta") # fitted trend line

###GGPLOT##



## Plain scatter of income against balance drawn with open circles (shape 1).
ggplot(data = Credit_card.data, mapping = aes(x = balance, y = income)) +
  geom_point(shape = 1)

## Same scatter with a straight-line (lm) trend; se = FALSE suppresses the
## confidence band around the line.
ggplot(data = Credit_card.data, mapping = aes(x = balance, y = income)) +
  geom_point(shape = 1) +
  geom_smooth(method = "lm", colour = "blue", se = FALSE)

## `geom_smooth()` using formula 'y ~ x'

## Linear trend again, this time keeping the default confidence band.
ggplot(data = Credit_card.data, mapping = aes(x = balance, y = income)) +
  geom_point(shape = 1) +
  geom_smooth(method = "lm", colour = "blue")

## `geom_smooth()` using formula 'y ~ x'

## Curved (automatically chosen smoother) trend line with its confidence band.
ggplot(data = Credit_card.data, mapping = aes(x = balance, y = income)) +
  geom_point(colour = "red") +
  geom_smooth(colour = "yellow") +
  labs(title = "Income vs Balance", x = "Balance", y = "Income")

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# par() only affects base graphics; ggplot2 ignores mfrow, so this 2x1 layout
# applies to the base boxplot() calls further down, not to the ggplot below.
par(mfrow = c(2, 1))
## Differentiate by default status##
# `default` is stored as the numeric codes 1/2; mapping it to colour as a number
# produces a continuous gradient legend for a two-valued variable. It is mapped
# as a No/Yes factor on the points only, so the smoother is still grouped (and
# its band filled) by student status exactly as before.
ggplot(
  Credit_card.data,
  aes(x = balance, y = income, fill = student)
) +
  geom_point(
    aes(colour = factor(as.integer(default), levels = 1:2, labels = c("No", "Yes")))
  ) +
  geom_smooth() +
  labs(
    x = "Balance in Dollars",
    y = "Income in Dollars",
    title = "Balance on Income and Default Status",
    colour = "default"
  )

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

##scatter plot seems to suggest that there's a relationship between default status and balance while income is not related##
##we seem to have more defaults from around the balance of 1000 dollars and increases with credit card balance
##The defaults rate is spread evenly above and below our fitted trend line
##students account for 38% of the defaults (127 of 333) although they are only about 29% of the card holders (2944 of 10 000);
## students who default tend to have a credit card balance of around $1000 and above while their income is low


## Income against balance, with points coloured by student status and one
## blue smoothed trend per student group.
ggplot(
  data = Credit_card.data,
  mapping = aes(x = balance, y = income, colour = student)
) +
  geom_point() +
  geom_smooth(colour = "blue") +
  labs(
    title = "Balance on Income per student status",
    x = "Balance in Dollars",
    y = "Income in Dollars"
  )

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

##The scatter plot between income and balance shows us that the students incomes is less than $40 000 dollars
##They account for a disproportionate share of the defaults (38% of the defaults versus about 29% of the card holders), since most of the student balance is located to the right of $1000, putting them in the high risk zone of default
##another fact is that its evident that the students income lie below the trend line


# Horizontal box plots of income and balance. The par(mfrow = c(2, 1)) set
# earlier in the script is still in effect here, so the two plots are stacked.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
boxplot(Credit_card.data$income,
  xlab = "INCOME",
  main = "Credit Card:Income", col = "purple", horizontal = TRUE
)
boxplot(Credit_card.data$balance,
  col = "blue",
  horizontal = TRUE, xlab = "Balance in Dollars",
  main = "Boxplot of Balance"
)

# Smallest and largest income in the data.
min(Credit_card.data[["income"]])

## [1] 771.9677

max(Credit_card.data[["income"]])

## [1] 73554.23

# Smallest and largest credit card balance in the data.
min(Credit_card.data[["balance"]])

## [1] 0

max(Credit_card.data[["balance"]])

## [1] 2654.323

# Income ranges from $771.97 to $73 554.23;
# balance ranges from $0 to $2654.
# Column summaries again, now with `default` recoded to 1/2.
summary(object = Credit_card.data)

##     default      student       balance           income     
##  Min.   :1.000   No :7056   Min.   :   0.0   Min.   :  772  
##  1st Qu.:1.000   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
##  Median :1.000              Median : 823.6   Median :34553  
##  Mean   :1.033              Mean   : 835.4   Mean   :33517  
##  3rd Qu.:1.000              3rd Qu.:1166.3   3rd Qu.:43808  
##  Max.   :2.000              Max.   :2654.3   Max.   :73554

##There are 2944 students in our data set from a population of 10 000 with only
#333 defaults on credit cards and we saw in the scatter plot that most students fall in the default zone at balance of above $1000


##MODEL#

#The plots suggest that credit card balance, but not income, is a useful predictor of default status
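#However, to be thorough in our investigation we will begin by fitting all the predictors to a regression model
#NOTE(review): the models below are fitted with lm() (ordinary least squares on default coded 1/2), not with a logistic regression; the reasons listed here appear to describe a logistic model - confirm whether glm(..., family = binomial) was intended
#we chose this particular model to fit this credit card data because: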
    #the model does well if the number of parameters is low
    #highly interpretable
    #quick operating time in R
    #fits the binary nature of our problem well


###Regression##

#modeling,analysis of variance, descriptives and inferences

# Model 1: linear model of default (coded 1/2) on all three predictors.
Md1 <- lm(
  formula = default ~ balance + student + income,
  data = Credit_card.data
)
Md1

## 
## Call:
## lm(formula = default ~ balance + student + income, data = Credit_card.data)
## 
## Coefficients:
## (Intercept)      balance   studentYes       income  
##   9.188e-01    1.327e-04   -1.033e-02    1.992e-07

# Coefficient table, residual summary and R-squared for model 1.
summary(object = Md1)

## 
## Call:
## lm(formula = default ~ balance + student + income, data = Credit_card.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24610 -0.06979 -0.02645  0.02018  0.98542 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.188e-01  8.382e-03 109.617   <2e-16 ***
## balance      1.327e-04  3.547e-06  37.412   <2e-16 ***
## studentYes  -1.033e-02  5.663e-03  -1.824   0.0682 .  
## income       1.992e-07  1.917e-07   1.039   0.2990    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.168 on 9996 degrees of freedom
## Multiple R-squared:  0.124,  Adjusted R-squared:  0.1238 
## F-statistic: 471.7 on 3 and 9996 DF,  p-value: < 2.2e-16

# 95% confidence intervals for the model 1 coefficients.
confint(object = Md1, level = 0.95)

##                     2.5 %       97.5 %
## (Intercept)  9.023900e-01 9.352511e-01
## balance      1.257375e-04 1.396420e-04
## studentYes  -2.143107e-02 7.708687e-04
## income      -1.766720e-07 5.749776e-07

# Sequential analysis-of-variance table for model 1.
anova(object = Md1)

## Analysis of Variance Table
## 
## Response: default
##             Df  Sum Sq Mean Sq  F value    Pr(>F)    
## balance      1  39.461  39.461 1398.826 < 2.2e-16 ***
## student      1   0.432   0.432   15.305 9.207e-05 ***
## income       1   0.030   0.030    1.079     0.299    
## Residuals 9996 281.988   0.028                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

##Md1's Adjusted R^2 is 0.1238
##and the summary of Md1 tells us that balance is indeed a useful predictor of default status; whether or not the account holder is a student is only marginally significant in the summary (p = 0.068) but significant in the sequential ANOVA table (p < 0.001)

# Model 2: same as model 1 but with income dropped.
Md2 <- lm(
  formula = default ~ balance + student,
  data = Credit_card.data
)
Md2

## 
## Call:
## lm(formula = default ~ balance + student, data = Credit_card.data)
## 
## Coefficients:
## (Intercept)      balance   studentYes  
##   0.9267835    0.0001327   -0.0147253

# Coefficient table, residual summary and R-squared for model 2.
summary(object = Md2)

## 
## Call:
## lm(formula = default ~ balance + student, data = Credit_card.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24406 -0.06966 -0.02638  0.02023  0.98665 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.268e-01  3.390e-03 273.405  < 2e-16 ***
## balance      1.327e-04  3.547e-06  37.414  < 2e-16 ***
## studentYes  -1.473e-02  3.764e-03  -3.912 9.21e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.168 on 9997 degrees of freedom
## Multiple R-squared:  0.1239, Adjusted R-squared:  0.1237 
## F-statistic: 707.1 on 2 and 9997 DF,  p-value: < 2.2e-16

# 95% confidence intervals for the model 2 coefficients.
confint(object = Md2, level = 0.95)

##                     2.5 %        97.5 %
## (Intercept)  0.9201388164  0.9334281294
## balance      0.0001257446  0.0001396492
## studentYes  -0.0221034535 -0.0073471508

# Sequential analysis-of-variance table for model 2.
anova(object = Md2)

## Analysis of Variance Table
## 
## Response: default
##             Df  Sum Sq Mean Sq  F value    Pr(>F)    
## balance      1  39.461  39.461 1398.815 < 2.2e-16 ***
## student      1   0.432   0.432   15.305 9.208e-05 ***
## Residuals 9997 282.018   0.028                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Md2's Adjusted R^2 (0.1237) is essentially the same as that of Md1 (0.1238) but with fewer variables to predict the data
##this further suggests the lack of relationship between income and default status

# Model 3: balance as the only predictor (student status dropped as well).
Md3 <- lm(
  formula = default ~ balance,
  data = Credit_card.data
)
Md3

## 
## Call:
## lm(formula = default ~ balance, data = Credit_card.data)
## 
## Coefficients:
## (Intercept)      balance  
##   0.9248080    0.0001299

# Coefficient table, residual summary and R-squared for model 3.
summary(object = Md3)

## 
## Call:
## lm(formula = default ~ balance, data = Credit_card.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.23533 -0.06939 -0.02628  0.02004  0.99046 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.248e-01  3.354e-03  275.70   <2e-16 ***
## balance     1.299e-04  3.475e-06   37.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1681 on 9998 degrees of freedom
## Multiple R-squared:  0.1226, Adjusted R-squared:  0.1225 
## F-statistic:  1397 on 1 and 9998 DF,  p-value: < 2.2e-16

# 95% confidence intervals for the model 3 coefficients.
confint(object = Md3, level = 0.95)

##                    2.5 %       97.5 %
## (Intercept) 0.9182328199 0.9313832624
## balance     0.0001230606 0.0001366838

# Analysis-of-variance table for model 3.
anova(object = Md3)

## Analysis of Variance Table
## 
## Response: default
##             Df  Sum Sq Mean Sq F value    Pr(>F)    
## balance      1  39.461  39.461  1396.8 < 2.2e-16 ***
## Residuals 9998 282.450   0.028                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Md3's Adjusted R^2 is 0.1225

# Correlation between default status (coded 1/2) and balance.
cor(x = Credit_card.data$default, y = Credit_card.data$balance)

## [1] 0.3501192

# Model 4: student status as the only predictor (balance left out).
Md4 <- lm(
  formula = default ~ student,
  data = Credit_card.data
)
Md4

## 
## Call:
## lm(formula = default ~ student, data = Credit_card.data)
## 
## Coefficients:
## (Intercept)   studentYes  
##     1.02920      0.01394

# Coefficient table, residual summary and R-squared for model 4.
summary(object = Md4)

## 
## Call:
## lm(formula = default ~ student, data = Credit_card.data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.04314 -0.04314 -0.02920 -0.02920  0.97080 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.029195   0.002135 482.101  < 2e-16 ***
## studentYes  0.013944   0.003935   3.544 0.000396 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1793 on 9998 degrees of freedom
## Multiple R-squared:  0.001255,   Adjusted R-squared:  0.001155 
## F-statistic: 12.56 on 1 and 9998 DF,  p-value: 0.000396

# 95% confidence intervals for the model 4 coefficients.
confint(object = Md4, level = 0.95)

##                   2.5 %     97.5 %
## (Intercept) 1.025010351 1.03337967
## studentYes  0.006231145 0.02165601

# Analysis-of-variance table for model 4.
anova(object = Md4)

## Analysis of Variance Table
## 
## Response: default
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## student      1   0.40 0.40387  12.559 0.000396 ***
## Residuals 9998 321.51 0.03216                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Correlation between default status (coded 1/2) and income.
cor(x = Credit_card.data$default, y = Credit_card.data$income)

## [1] -0.01987145

## it is clear that if we remove balance as a factor the adjusted R^2 reduces and the predictive power of our model significantly drops
##this goes to further solidify the usefulness of Credit card balance as a predictor of credit card defaults status

## Effect of student status: on its own (Md4) the studentYes coefficient is
## positive (0.0139), but once balance is in the model (Md1, Md2) it is negative,
## so the sign of the student effect depends on whether balance is held fixed.
coef(Md1)

##   (Intercept)       balance    studentYes        income 
##  9.188205e-01  1.326897e-04 -1.033010e-02  1.991528e-07

## Predict the response for hypothetical card holders: fitted values closer to
## 1 mean "no default", closer to 2 mean "default".
## predict() is used instead of hand-typed coefficients so that the values
## always come from the fitted Md1 and the student dummy (No = 0, Yes = 1) is
## applied by R. The hand calculation it replaces coded a student as 2 and
## compensated with a shifted intercept, which gives the same number but is
## easy to get wrong and goes stale as soon as the model is refitted.
student_levels <- levels(Credit_card.data$student)

# 1. balance = $1500, student = Yes, income = $25 000
unname(predict(
  Md1,
  newdata = data.frame(
    balance = 1500,
    student = factor("Yes", levels = student_levels),
    income = 25000
  )
))

## [1] 1.112504

# the mean default is 1.033
# the response is 1.11, which is close to the mean and to 1, so this card holder is more likely not to default

# 2. balance = $7000, student = Yes, income = $50
# NOTE: $7000 is far above the largest balance in the data ($2654), so this
# prediction is an extrapolation beyond the observed range.
unname(predict(
  Md1,
  newdata = data.frame(
    balance = 7000,
    student = factor("Yes", levels = student_levels),
    income = 50
  )
))

## [1] 1.837328

## the model suggests that balance plays a crucial role in whether or not an individual defaults
## as we increase the credit card balance the response gets closer to 2, and it keeps growing as the balance grows
## thus credit card balance plays a major role in whether or not the client defaults;
## income contributes almost nothing (coefficient 1.99e-07, p = 0.299 in summary(Md1)).
# Restore the single-panel base graphics layout.
par(mfrow = c(1, 1))

# credit-card-data-part-2-incomplete.R
#
# xholi
#
# 2021-01-05