# Read Libraries
library(readr)
library(caret)
library(randomForest)
library(dplyr)
library(mlbench)
## This dataset contains information on default payments, demographic factors, and credit data
## of credit card clients in Taiwan from April 2005 to September 2005.
## There are 25 variables in the data set. Default payment (1 = yes, 0 = no) is treated as a factor variable.
# Read Credit Card Data
# The CSV location defaults to the original author's local path; set the
# UCI_CREDIT_CARD_CSV environment variable to read the file from elsewhere.
credit_card_csv <- Sys.getenv(
  "UCI_CREDIT_CARD_CSV",
  unset = "C:/Users/pdbro/OneDrive/Desktop/UCI_Credit_Card.csv"
)
UCI_Credit_Card <- read_csv(credit_card_csv)
# View() opens a GUI data viewer, so it is only called in an interactive
# session; it may fail when the script is run non-interactively.
if (interactive()) {
  View(UCI_Credit_Card)
}
# Print the column types as read from the file.
str(UCI_Credit_Card)
# Convert the outcome column to a factor so that it is treated as a class
# label rather than a number, then re-inspect the structure of the data.
UCI_Credit_Card[["default.payment.next.month"]] <- as.factor(
  UCI_Credit_Card[["default.payment.next.month"]]
)
str(UCI_Credit_Card)
## spc_tbl_ [30,000 × 25] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ID : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
## $ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ SEX : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
## $ EDUCATION : num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
## $ MARRIAGE : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
## $ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
## $ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
## $ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
## $ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
## $ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
## $ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
## $ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
## $ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
## $ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
## $ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
## $ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
## $ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
## $ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
## $ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
## $ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
## $ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
## $ default.payment.next.month: Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
## - attr(*, "spec")=
# Train/test split
# Draw a label (1 or 2) for every row with probabilities 0.7 / 0.3. Rows
# labelled 1 form the training set and rows labelled 2 the testing set.
# The labels are sampled with replacement, so the split is only approximately
# 70/30. The seed makes the assignment reproducible.
set.seed(1234)
index <- sample(
  x = 2,
  size = nrow(UCI_Credit_Card),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
# Training data
Training <- UCI_Credit_Card[index == 1, ]
# Testing data
Testing <- UCI_Credit_Card[index == 2, ]
# Random Forest Model
# Fit a classification forest on the training rows, using every other column
# as a predictor and randomForest's default settings, then print the summary.
# NOTE(review): the `.` in the formula also pulls in the ID column, which looks
# like a plain row identifier — confirm whether it should be excluded.
rf <- randomForest(
  default.payment.next.month ~ .,
  data = Training
)
print(rf)
## Call:
## randomForest(formula =
## default.payment.next.month ~ ., data =
## Training)
## Type of random forest:
## classification
## Number of trees: 500
## No. of variables tried at each split: 4
## OOB estimate of error rate: 18%
## Confusion matrix:
## 0 1 class.error
## 0 15468 880 0.05382921
## 1 2896 1728 0.62629758
## rf$confusion
## 0 1 class.error
## 0 15468 880 0.05382921
## 1 2896 1728 0.62629758
# Evaluate/Test Model
# Predict the class of every held-out row, then tabulate the predictions
# against the observed outcome.
# NOTE(review): the pasted results below report "0" (no default) as the
# positive class, so Sensitivity describes non-defaulters — consider passing
# positive = "1" if default is the event of interest.
p2 <- predict(rf, newdata = Testing)
confusionMatrix(
  data = p2,
  reference = Testing$default.payment.next.month
)
## Confusion Matrix and Statistics
## Reference
## Prediction 0 1
## 0 6624 1274
## 1 392 738
## Accuracy : 0.8155
## 95% CI : (0.8073, 0.8234)
## No Information Rate : 0.7771
## P-Value [Acc > NIR] : < 2.2e-16
## Kappa : 0.3685
## Mcnemar's Test P-Value : < 2.2e-16
## Sensitivity : 0.9441
## Specificity : 0.3668
## Pos Pred Value : 0.8387
## Neg Pred Value : 0.6531
## Prevalence : 0.7771
## Detection Rate : 0.7337
## Detection Prevalence : 0.8748
## Balanced Accuracy : 0.6555
## 'Positive' Class : 0
# Model Improvement
# Baseline caret model: a random forest scored by accuracy under 10-fold
# cross-validation repeated 3 times, with mtry held at a single value.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
seed <- 7
metric <- "Accuracy"
set.seed(seed)
# The original computed sqrt(ncol(x)), but `x` is never defined in this
# script. The console output pasted after this block reports
# mtry = 7.745967 (the square root of 60), which does not match this
# 25-column data set, so that output needs to be regenerated.
# mtry is a count of predictors, so derive it from the training data and
# round down to a whole number.
n_predictors <- ncol(Training) - 1L # every column except the response
mtry <- floor(sqrt(n_predictors))
tunegrid <- expand.grid(.mtry = mtry)
rf_default <- train(
  default.payment.next.month ~ .,
  data = Training,
  method = "rf",
  metric = metric,
  tuneGrid = tunegrid,
  trControl = control
)
print(rf_default)
## Random Forest
## 20972 samples
## 24 predictor
## 2 classes: '0', '1'
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 18874, 18875, 18874, 18875, 18874, 18875, ...
## Resampling results:
## Accuracy Kappa
## 0.8201576 0.3812661
## Tuning parameter 'mtry' was held constant at
## a value of 7.745967
# Importance of variables
# Plot the 10 variables that the fitted forest `rf` ranks as most important.
varImpPlot(rf, n.var = 10, main = "Top 10 Important Variables")
# The bare word PLOT that followed was a placeholder for the figure; left
# uncommented, R evaluates it as an undefined object and raises an error.
## PLOT