# Read Libraries

library(readr)
library(caret)
library(randomForest)
library(dplyr)
library(mlbench)

## This dataset contains information on default payments, demographic factors, and credit data
## of credit card clients in Taiwan from April 2005 to September 2005.
## There are 25 variables in the data set. Default payment (1 = yes, 0 = no) is a factor variable.
# Read Credit Card Data
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
credit_card_path <- "C:/Users/pdbro/OneDrive/Desktop/UCI_Credit_Card.csv"
UCI_Credit_Card <- read_csv(credit_card_path)

# View() opens a data viewer, so only call it from an interactive session;
# this keeps the script runnable via Rscript / source() in batch mode.
if (interactive()) {
  View(UCI_Credit_Card)
}
str(UCI_Credit_Card)

# Convert the default-payment response from numeric to a factor
UCI_Credit_Card$default.payment.next.month <-
  as.factor(UCI_Credit_Card$default.payment.next.month)
str(UCI_Credit_Card)


## spc_tbl_ [30,000 × 25] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ID                        : num [1:30000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ LIMIT_BAL                 : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ SEX                       : num [1:30000] 2 2 2 2 1 1 1 2 2 1 ...
##  $ EDUCATION                 : num [1:30000] 2 2 2 2 2 1 1 2 3 3 ...
##  $ MARRIAGE                  : num [1:30000] 1 2 2 1 1 2 2 2 1 2 ...
##  $ AGE                       : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
##  $ PAY_0                     : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ PAY_2                     : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
##  $ PAY_3                     : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ PAY_4                     : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
##  $ PAY_5                     : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
##  $ PAY_6                     : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ BILL_AMT1                 : num [1:30000] 3913 2682 29239 46990 8617 ...
##  $ BILL_AMT2                 : num [1:30000] 3102 1725 14027 48233 5670 ...
##  $ BILL_AMT3                 : num [1:30000] 689 2682 13559 49291 35835 ...
##  $ BILL_AMT4                 : num [1:30000] 0 3272 14331 28314 20940 ...
##  $ BILL_AMT5                 : num [1:30000] 0 3455 14948 28959 19146 ...
##  $ BILL_AMT6                 : num [1:30000] 0 3261 15549 29547 19131 ...
##  $ PAY_AMT1                  : num [1:30000] 0 0 1518 2000 2000 ...
##  $ PAY_AMT2                  : num [1:30000] 689 1000 1500 2019 36681 ...
##  $ PAY_AMT3                  : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ PAY_AMT4                  : num [1:30000] 0 1000 1000 1100 9000 ...
##  $ PAY_AMT5                  : num [1:30000] 0 0 1000 1069 689 ...
##  $ PAY_AMT6                  : num [1:30000] 0 2000 5000 1000 679 ...
##  $ default.payment.next.month: Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
##  - attr(*, "spec")=
# Train/Test Split
# Label every row 1 (training, probability 0.7) or 2 (testing, probability 0.3).
set.seed(1234)
n_rows <- nrow(UCI_Credit_Card)
index <- sample(2, size = n_rows, replace = TRUE, prob = c(0.7, 0.3))

# Rows labelled 1 make up the training set
train_rows <- which(index == 1)
Training <- UCI_Credit_Card[train_rows, ]

# Rows labelled 2 make up the testing set
test_rows <- which(index == 2)
Testing <- UCI_Credit_Card[test_rows, ]

# Random Forest Model
# Fit a random forest predicting default.payment.next.month from every other
# column of Training.
# NOTE(review): `~ .` also feeds the ID column (1, 2, 3, ...) to the model as a
# predictor — confirm that is intended.
rf <- randomForest(default.payment.next.month ~ ., data = Training)
print(rf)
## Call:
##  randomForest(formula = default.payment.next.month ~ ., data = Training)
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4

##        OOB estimate of  error rate: 18%
## Confusion matrix:
##      0    1 class.error
## 0 15468  880  0.05382921
## 1  2896 1728  0.62629758

## rf$confusion
##      0    1 class.error
## 0 15468  880  0.05382921
## 1  2896 1728  0.62629758
# Evaluate/Test Model
# Predict a class for each held-out row, then tabulate predictions against the
# observed labels.
p2 <- predict(rf, newdata = Testing)
observed <- Testing[["default.payment.next.month"]]
confusionMatrix(data = p2, reference = observed)
## Confusion Matrix and Statistics

##          Reference
## Prediction    0    1
##         0 6624 1274
##         1  392  738
                                          
##               Accuracy : 0.8155          
##                 95% CI : (0.8073, 0.8234)
##    No Information Rate : 0.7771          
##    P-Value [Acc > NIR] : < 2.2e-16       
                                          
##                  Kappa : 0.3685          
                                          
## Mcnemar's Test P-Value : < 2.2e-16       
                                          
##            Sensitivity : 0.9441          
##            Specificity : 0.3668          
##         Pos Pred Value : 0.8387          
##         Neg Pred Value : 0.6531          
##             Prevalence : 0.7771          
##         Detection Rate : 0.7337          
##   Detection Prevalence : 0.8748          
##      Balanced Accuracy : 0.6555          
                                          
##       'Positive' Class : 0               
                            
# Model Improvement
# Create model with default parameters, evaluated with 10-fold cross-validation
# repeated 3 times.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
seed <- 7
metric <- "Accuracy"
set.seed(seed)

# mtry must be derived from the data actually being modelled: every column of
# Training except the response is a predictor. floor(sqrt(p)) matches the
# value randomForest() picked by default above ("variables tried at each
# split: 4").
n_predictors <- ncol(Training) - 1
mtry <- floor(sqrt(n_predictors))
tunegrid <- expand.grid(.mtry = mtry)

rf_default <- train(
  default.payment.next.month ~ .,
  data = Training,
  method = "rf",
  metric = metric,
  tuneGrid = tunegrid,
  trControl = control
)
print(rf_default)
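## Random Forest 
## 
## 20972 samples
##    24 predictor
##     2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 18874, 18875, 18874, 18875, 18874, 18875, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8201576  0.3812661
## 
## Tuning parameter 'mtry' was held constant at a value of 7.745967
## NOTE(review): 7.745967 = sqrt(60), so this recorded run used an `x` with 60
## columns that is not defined earlier in this script.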
# Importance of variables
# Plot the 10 most important variables of the fitted forest `rf`.
varImpPlot(rf, n.var = 10, main = "Top 10 Important Variables")

# The bare token below was being evaluated as an R symbol (error: object
# 'PLOT' not found); kept as a comment marking where the figure appeared.
## PLOT