Predict whether customer will upgrade the credit card

Load the library function

library(tidyverse)

Read the file and explore the dimension

setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Statistical BootCamp/WK4")
credit<- read.csv("CreditCard.csv")
dim(credit)

## [1] 30  3

Conver the varibles to varible class

credit$upgraded <- factor(credit$upgraded)
credit$extraCards <- factor(credit$extraCards)

Create the logisitc regression model

creditFit <- glm(upgraded~purchases + extraCards, family = binomial, data = credit)
summary(creditFit)

## 
## Call:
## glm(formula = upgraded ~ purchases + extraCards, family = binomial, 
##     data = credit)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.94314  -0.34400  -0.09307   0.52178   1.64705  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -6.93984    2.94722  -2.355   0.0185 *
## purchases    0.13947    0.06807   2.049   0.0405 *
## extraCards1  2.77434    1.19270   2.326   0.0200 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 41.054  on 29  degrees of freedom
## Residual deviance: 20.077  on 27  degrees of freedom
## AIC: 26.077
## 
## Number of Fisher Scoring iterations: 6

Likelihood ratio test: significant of the difference between the full model and the null model

null hypothesis: full model = null model

alternate hypothesis; full model != null model

attach(creditFit)
pchisq(null.deviance-deviance, df.null-df.residual, lower.tail = FALSE)

## [1] 2.785488e-05

Output the logistic function outcome for all cases

prob <- predict(creditFit, type = 'response')
prob

##           1           2           3           4           5           6 
## 0.078507444 0.652061596 0.001907707 0.002998992 0.005883080 0.008997473 
##           7           8           9          10          11          12 
## 0.016994091 0.845392469 0.848611291 0.756374788 0.964800466 0.178088474 
##          13          14          15          16          17          18 
## 0.045881153 0.257589631 0.975219032 0.025933578 0.672894639 0.941115828 
##          19          20          21          22          23          24 
## 0.029604487 0.372622722 0.550559062 0.808558568 0.118777166 0.061572863 
##          25          26          27          28          29          30 
## 0.521556408 0.603506473 0.042200980 0.983585990 0.943613699 0.684589849

Set the threshold for predicting Y=1 based on logistic regression. One way is to use the proportion of “Yes” [Upgraded cases] in the data.

threshold <- sum(credit$upgraded == "1")/length(credit$upgraded)
threshold

## [1] 0.4333333

If logistic regression probability > threshold, predict 1, else predict 0.

predict <- ifelse(prob>threshold, 1, 0)
predict

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
##  0  1  0  0  0  0  0  1  1  1  1  0  0  0  1  0  1  1  0  0  1  1  0  0  1 
## 26 27 28 29 30 
##  1  0  1  1  1

Create a contigency table with actuals on rows and predictions on columns based on the entire dataset credit

table(credit$upgraded, predict)

##    predict
##      0  1
##   0 13  4
##   1  2 11

caTools package has a simple function to easily create train and test set

library(caTools)

Generate a random number sequence that can be reproduced to check results thru the seed number.

set.seed(22)

# Randomly split data from Y into 2 sets in predefined ratio while preserving relative ratios of different value in Y
split <- sample.split(credit$upgraded, SplitRatio = 0.7)

# Get training and test data
trainset <- subset(credit, split == TRUE)
testset <- subset(credit, split == FALSE)
trainset

##    upgraded purchases extraCards
## 1         0   32.1007          0
## 2         1   34.3706          1
## 3         0    4.8749          0
## 5         0   12.9783          0
## 7         0   20.6648          0
## 8         1   42.0483          1
## 9         0   42.2264          1
## 10        1   37.9900          1
## 11        1   53.6063          1
## 13        0   27.9999          0
## 14        1   42.1694          0
## 15        1   56.1997          1
## 16        0   23.7609          0
## 19        0   24.7372          0
## 21        0   31.3220          1
## 24        0   30.2280          0
## 25        1   50.3778          0
## 26        0   52.7713          0
## 27        0   27.3728          0
## 29        1   50.0686          1
## 30        1   35.4234          1

testset

##    upgraded purchases extraCards
## 4         0    8.1263          0
## 6         0   16.0471          0
## 12        0   38.7936          0
## 17        0   35.0388          1
## 18        1   49.7388          1
## 20        1   26.1315          1
## 22        1   40.1967          1
## 23        0   35.3899          0
## 28        1   59.2146          1

Develop logistic regression model on Trainset

model <- glm(upgraded~purchases+extraCards, data=trainset, family = binomial)
summary(model)

## 
## Call:
## glm(formula = upgraded ~ purchases + extraCards, family = binomial, 
##     data = trainset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8956  -0.3239  -0.1033   0.6098   1.5050  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -7.22551    3.52837  -2.048   0.0406 *
## purchases    0.15371    0.08106   1.896   0.0579 .
## extraCards1  2.34999    1.37511   1.709   0.0875 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28.682  on 20  degrees of freedom
## Residual deviance: 14.453  on 18  degrees of freedom
## AIC: 20.453
## 
## Number of Fisher Scoring iterations: 6

Confusion matrix on Trainset

probTrainset <- predict(model, type = 'response')
threshold2 <- sum(trainset$upgraded=="1")/length(trainset$upgraded)
predictTrainset <- ifelse(probTrainset>threshold2, "1", "0")
table(trainset$upgraded, predictTrainset)

##    predictTrainset
##     0 1
##   0 9 3
##   1 1 8

Check the accuracy of the model generated by traiset - (True posive + True Negative)/ total of observatios

AccuracyTrain <- mean(predictTrainset == trainset$upgraded)
print(paste('Trainset Accuracy = ', AccuracyTrain))

## [1] "Trainset Accuracy =  0.80952380952381"

Confusion matrix on Testset by using the model created by trainset

probTestset <- predict(model, newdata=testset, type='response')
predictTestset <- ifelse(probTestset>threshold2, "1", "0")
table(testset$upgraded, predictTestset)

##    predictTestset
##     0 1
##   0 4 1
##   1 1 3

check the accuracy of the testset by using model generated by trainset

AccuracyTest <- mean(predictTestset == testset$upgraded)
print(paste('Testset Accuracy =', AccuracyTest))

## [1] "Testset Accuracy = 0.777777777777778"