Load the library function
library(tidyverse)
Read the file and explore the dimension
setwd("C:/Users/ngsook/Desktop/NUS EBA/Semester 2/Statistical BootCamp/WK4")
credit<- read.csv("CreditCard.csv")
dim(credit)
## [1] 30 3
Conver the varibles to varible class
credit$upgraded <- factor(credit$upgraded)
credit$extraCards <- factor(credit$extraCards)
Create the logisitc regression model
creditFit <- glm(upgraded~purchases + extraCards, family = binomial, data = credit)
summary(creditFit)
##
## Call:
## glm(formula = upgraded ~ purchases + extraCards, family = binomial,
## data = credit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.94314 -0.34400 -0.09307 0.52178 1.64705
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.93984 2.94722 -2.355 0.0185 *
## purchases 0.13947 0.06807 2.049 0.0405 *
## extraCards1 2.77434 1.19270 2.326 0.0200 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 41.054 on 29 degrees of freedom
## Residual deviance: 20.077 on 27 degrees of freedom
## AIC: 26.077
##
## Number of Fisher Scoring iterations: 6
Likelihood ratio test: significant of the difference between the full model and the null model
null hypothesis: full model = null model
alternate hypothesis; full model != null model
attach(creditFit)
pchisq(null.deviance-deviance, df.null-df.residual, lower.tail = FALSE)
## [1] 2.785488e-05
Output the logistic function outcome for all cases
prob <- predict(creditFit, type = 'response')
prob
## 1 2 3 4 5 6
## 0.078507444 0.652061596 0.001907707 0.002998992 0.005883080 0.008997473
## 7 8 9 10 11 12
## 0.016994091 0.845392469 0.848611291 0.756374788 0.964800466 0.178088474
## 13 14 15 16 17 18
## 0.045881153 0.257589631 0.975219032 0.025933578 0.672894639 0.941115828
## 19 20 21 22 23 24
## 0.029604487 0.372622722 0.550559062 0.808558568 0.118777166 0.061572863
## 25 26 27 28 29 30
## 0.521556408 0.603506473 0.042200980 0.983585990 0.943613699 0.684589849
Set the threshold for predicting Y=1 based on logistic regression. One way is to use the proportion of “Yes” [Upgraded cases] in the data.
threshold <- sum(credit$upgraded == "1")/length(credit$upgraded)
threshold
## [1] 0.4333333
If logistic regression probability > threshold, predict 1, else predict 0.
predict <- ifelse(prob>threshold, 1, 0)
predict
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 0 1 0 0 0 0 0 1 1 1 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1
## 26 27 28 29 30
## 1 0 1 1 1
Create a contigency table with actuals on rows and predictions on columns based on the entire dataset credit
table(credit$upgraded, predict)
## predict
## 0 1
## 0 13 4
## 1 2 11
Generate a random number sequence that can be reproduced to check results thru the seed number.
set.seed(22)
# Randomly split data from Y into 2 sets in predefined ratio while preserving relative ratios of different value in Y
split <- sample.split(credit$upgraded, SplitRatio = 0.7)
# Get training and test data
trainset <- subset(credit, split == TRUE)
testset <- subset(credit, split == FALSE)
trainset
## upgraded purchases extraCards
## 1 0 32.1007 0
## 2 1 34.3706 1
## 3 0 4.8749 0
## 5 0 12.9783 0
## 7 0 20.6648 0
## 8 1 42.0483 1
## 9 0 42.2264 1
## 10 1 37.9900 1
## 11 1 53.6063 1
## 13 0 27.9999 0
## 14 1 42.1694 0
## 15 1 56.1997 1
## 16 0 23.7609 0
## 19 0 24.7372 0
## 21 0 31.3220 1
## 24 0 30.2280 0
## 25 1 50.3778 0
## 26 0 52.7713 0
## 27 0 27.3728 0
## 29 1 50.0686 1
## 30 1 35.4234 1
testset
## upgraded purchases extraCards
## 4 0 8.1263 0
## 6 0 16.0471 0
## 12 0 38.7936 0
## 17 0 35.0388 1
## 18 1 49.7388 1
## 20 1 26.1315 1
## 22 1 40.1967 1
## 23 0 35.3899 0
## 28 1 59.2146 1
Develop logistic regression model on Trainset
model <- glm(upgraded~purchases+extraCards, data=trainset, family = binomial)
summary(model)
##
## Call:
## glm(formula = upgraded ~ purchases + extraCards, family = binomial,
## data = trainset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8956 -0.3239 -0.1033 0.6098 1.5050
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.22551 3.52837 -2.048 0.0406 *
## purchases 0.15371 0.08106 1.896 0.0579 .
## extraCards1 2.34999 1.37511 1.709 0.0875 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28.682 on 20 degrees of freedom
## Residual deviance: 14.453 on 18 degrees of freedom
## AIC: 20.453
##
## Number of Fisher Scoring iterations: 6
Confusion matrix on Trainset
probTrainset <- predict(model, type = 'response')
threshold2 <- sum(trainset$upgraded=="1")/length(trainset$upgraded)
predictTrainset <- ifelse(probTrainset>threshold2, "1", "0")
table(trainset$upgraded, predictTrainset)
## predictTrainset
## 0 1
## 0 9 3
## 1 1 8
Check the accuracy of the model generated by traiset - (True posive + True Negative)/ total of observatios
AccuracyTrain <- mean(predictTrainset == trainset$upgraded)
print(paste('Trainset Accuracy = ', AccuracyTrain))
## [1] "Trainset Accuracy = 0.80952380952381"
Confusion matrix on Testset by using the model created by trainset
probTestset <- predict(model, newdata=testset, type='response')
predictTestset <- ifelse(probTestset>threshold2, "1", "0")
table(testset$upgraded, predictTestset)
## predictTestset
## 0 1
## 0 4 1
## 1 1 3
check the accuracy of the testset by using model generated by trainset
AccuracyTest <- mean(predictTestset == testset$upgraded)
print(paste('Testset Accuracy =', AccuracyTest))
## [1] "Testset Accuracy = 0.777777777777778"