This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
#EDA for Taiwan Credit Card data
library(data.table)
# reading data as data.table
CCdefault.dt <- fread("MCICreditCardDefault.csv")
# attaching the data
attach(CCdefault.dt)
# dimension of the data table
dim(CCdefault.dt)
## [1] 29601 9
# column names
colnames(CCdefault.dt)
## [1] "Id" "CreditLimit" "Male" "Education"
## [5] "MaritalStatus" "Age" "BillOutstanding" "LastPayment"
## [9] "Default"
# structure of the dataframe
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : int 0 0 0 0 1 1 1 0 0 1 ...
## $ Education : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : int 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : int 1 1 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
#Data tyoe conversions
# convert 'Male' as a factor
CCdefault.dt[, Male := as.factor(Male)]
# convert 'Education' as a factor
CCdefault.dt[, Education := as.factor(Education)]
# convert 'MaritalStatus' as a factor
CCdefault.dt[, MaritalStatus := as.factor(MaritalStatus)]
# convert 'Default' as a factor
CCdefault.dt[, Default := as.factor(Default)]
# verifying conversion
str(CCdefault.dt)
## Classes 'data.table' and 'data.frame': 29601 obs. of 9 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CreditLimit : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ Male : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 2 ...
## $ Education : Factor w/ 4 levels "1","2","3","4": 2 2 2 2 2 1 1 2 3 3 ...
## $ MaritalStatus : Factor w/ 3 levels "1","2","3": 1 2 2 1 1 2 2 2 1 2 ...
## $ Age : int 24 26 34 37 57 37 29 23 28 35 ...
## $ BillOutstanding: int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ LastPayment : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ Default : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
#Checking the levels of 'Default variable'
levels(CCdefault.dt$Default) <- c("No","Yes")
# levels of the target variable
levels(CCdefault.dt$Default)
## [1] "No" "Yes"
#Reordering the levels of 'Default variable'
CCdefault.dt$Default <- ordered(CCdefault.dt$Default, levels = c("Yes", "No"))
# verifying the new order of levels
levels(CCdefault.dt$Default)
## [1] "Yes" "No"
#Data partition
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
set.seed(2341)
trainIndex <- createDataPartition(CCdefault.dt$Default, p = 0.80, list = FALSE)
# 80% training data
trainData.dt <- CCdefault.dt[trainIndex, ]
# 20% testing data
testData.dt <- CCdefault.dt[-trainIndex, ]
#Verifying the attributes of partitioned data sets
# dimension of training dataset
dim(trainData.dt)
## [1] 23681 9
# dimension of testing dataset
dim(testData.dt)
## [1] 5920 9
# proportion of defaulters in training dataset
round(prop.table(table(trainData.dt$Default))*100,2)
##
## Yes No
## 22.31 77.69
# proportion of defaulters in test dataset
round(prop.table(table(testData.dt$Default))*100,2)
##
## Yes No
## 22.31 77.69
#KNN algorithm implementation
#KNN model development
set.seed(3333)
library(caret)
# Set control parameters
trctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
set.seed(3333)
# Run kNN Classifier in package caret
knn_fit <- train(Default ~ .,
data = trainData.dt,
method = "knn",
trControl = trctrl,
preProcess = c("center", "scale"),
tuneLength = 10)
# kNN model summary
knn_fit
## k-Nearest Neighbors
##
## 23681 samples
## 8 predictor
## 2 classes: 'Yes', 'No'
##
## Pre-processing: centered (11), scaled (11)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 21313, 21314, 21313, 21313, 21312, 21314, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7327261 0.04264164
## 7 0.7467312 0.04021462
## 9 0.7529805 0.02928387
## 11 0.7581885 0.02459777
## 13 0.7627492 0.02378213
## 15 0.7662680 0.02270087
## 17 0.7686751 0.01919706
## 19 0.7708850 0.01794406
## 21 0.7722223 0.01641531
## 23 0.7731231 0.01487522
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
# predicting the test set observations
kNNPred <- predict(knn_fit, testData.dt, type = "prob")
#Setting cut off probability to 20%
classify20KNN <- ifelse(kNNPred$Yes > 0.20,"Yes","No")
# ordering the levels
classify20KNN <- ordered(classify20KNN, levels = c("Yes", "No"))
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))
#Confusion matrix
cmKNN <- table(Predicted = classify20KNN, Actual = testData.dt$Default)
confusionMatrix(cmKNN)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 942 2547
## No 379 2052
##
## Accuracy : 0.5057
## 95% CI : (0.4929, 0.5186)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1005
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7131
## Specificity : 0.4462
## Pos Pred Value : 0.2700
## Neg Pred Value : 0.8441
## Prevalence : 0.2231
## Detection Rate : 0.1591
## Detection Prevalence : 0.5894
## Balanced Accuracy : 0.5796
##
## 'Positive' Class : Yes
##
#ROC curve and AUC
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
PredLRKNN <- predict(knn_fit, testData.dt,type = "prob")
levels(testData.dt$Default)
## [1] "Yes" "No"
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))
lgPredObjKNN <- prediction(PredLRKNN[1],testData.dt$Default)
lgPerfObjKNN <- performance(lgPredObjKNN, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObjKNN,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")
# area under curve
aucLRKNN <- performance(lgPredObjKNN, measure = "auc")
aucLRKNN <- aucLRKNN@y.values[[1]]
aucLRKNN
## [1] 0.3914298
#Logistic regression model implementation
#Data partition
set.seed(766)
trainIndex <- createDataPartition(CCdefault.dt$Default, p = 0.80, list = FALSE)
# 80% training data
trainData.dt <- CCdefault.dt[trainIndex, ]
# 20% testing data
testData.dt <- CCdefault.dt[-trainIndex, ]
#Logit model development
logitModel <- glm(trainData.dt$Default ~ .,
data = trainData.dt,
family = binomial())
# summary of the logistic regression model
summary(logitModel)
##
## Call:
## glm(formula = trainData.dt$Default ~ ., family = binomial(),
## data = trainData.dt)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.6831 0.3601 0.6479 0.7764 1.0039
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.461e-01 9.101e-02 8.198 2.44e-16 ***
## Id 2.501e-06 1.844e-06 1.357 0.175
## CreditLimit 3.266e-06 1.628e-07 20.059 < 2e-16 ***
## Male1 -1.637e-01 3.253e-02 -5.032 4.84e-07 ***
## Education2 -2.807e-02 3.769e-02 -0.745 0.456
## Education3 1.417e-02 5.042e-02 0.281 0.779
## Education4 1.309e+00 4.236e-01 3.090 0.002 **
## MaritalStatus2 2.149e-01 3.681e-02 5.838 5.29e-09 ***
## MaritalStatus3 6.957e-02 1.475e-01 0.472 0.637
## Age -2.990e-03 1.978e-03 -1.511 0.131
## BillOutstanding -1.889e-06 2.634e-07 -7.170 7.52e-13 ***
## LastPayment 3.168e-05 3.250e-06 9.748 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25142 on 23680 degrees of freedom
## Residual deviance: 24225 on 23669 degrees of freedom
## AIC: 24249
##
## Number of Fisher Scoring iterations: 6
# predicting the test set observations
logitModelPred <- predict(logitModel, testData.dt, type = "response")
# plot of probabilities
plot(logitModelPred,
main = "Scatterplot of Probabilities of Default (test data)",
xlab = "Customer ID", ylab = "Predicted Probability of Default")
#Setting the cutoff probability to 20%
classify20LGT <- ifelse(logitModelPred > 0.2,"Yes","No")
# ordering the levels
classify20LGT <- ordered(classify20LGT, levels = c("Yes", "No"))
testData.dt$Default <- ordered(testData.dt$Default, levels = c("Yes", "No"))
# confusion matrix
cmLGT <- table(Predicted = classify20LGT, Actual = testData.dt$Default)
confusionMatrix(cmLGT)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 1321 4599
## No 0 0
##
## Accuracy : 0.2231
## 95% CI : (0.2126, 0.234)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.2231
## Neg Pred Value : NaN
## Prevalence : 0.2231
## Detection Rate : 0.2231
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Yes
##
#ROC curve and AUG
# loading the package
library(ROCR)
PredLR <- predict(logitModel, testData.dt,type = "response")
lgPredObj <- prediction(PredLR,testData.dt$Default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")
# area under curve
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.6318744