## Keep only the rows whose target variable is observed (drop NA targets)
final <- x[!is.na(x$default.payment.next.month), ]
library(caTools)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Fix the RNG seed so the train/test partition is reproducible, then mark
## roughly 75% of the rows for training via caTools::sample.split().
set.seed(88)
split <- sample.split(final$default.payment.next.month, SplitRatio = 0.75)
## Rows flagged TRUE form the training set; the remainder form the test set.
dresstrain <- subset(final, split)
dresstest <- subset(final, !split)
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
## Encode the target as a factor so randomForest() fits a classifier instead
## of a regression. The test target reuses the training levels so that the
## predictions and the reference labels share one level set later on.
dresstrain$default.payment.next.month <- as.factor(dresstrain$default.payment.next.month)
dresstest$default.payment.next.month <- factor(
  dresstest$default.payment.next.month,
  levels = levels(dresstrain$default.payment.next.month)
)
## Fit a 100-tree forest on every predictor except ID. ID is presumably just a
## row identifier; leaving it in the formula let the forest split on it (it
## ranks among the top variables in the importance output recorded below,
## which was produced before ID was excluded).
rf <- randomForest(default.payment.next.month ~ . - ID,
                   ntree = 100,
                   data = dresstrain)
## Plot the fitted forest (plot method for randomForest objects)
plot(rf)

## Per-variable importance scores
varImp(rf)
## Overall
## ID 511.20584
## LIMIT_BAL 372.63264
## SEX 74.76610
## EDUCATION 132.16961
## MARRIAGE 91.27863
## AGE 401.43206
## PAY_0 717.82297
## PAY_2 355.23851
## PAY_3 224.47560
## PAY_4 163.23384
## PAY_5 143.28872
## PAY_6 164.07542
## BILL_AMT1 427.82303
## BILL_AMT2 387.65018
## BILL_AMT3 367.98343
## BILL_AMT4 361.55032
## BILL_AMT5 349.59847
## BILL_AMT6 353.40670
## PAY_AMT1 370.41753
## PAY_AMT2 343.84676
## PAY_AMT3 329.83085
## PAY_AMT4 302.63583
## PAY_AMT5 310.36254
## PAY_AMT6 329.43405
## Important variables according to the model: dot chart of variable
## importance, sorted, showing at most 25 variables.
## TRUE is spelled out because T is an ordinary variable that can be reassigned.
varImpPlot(rf,
           sort = TRUE,
           n.var = 25,
           main = "Variable Importance")

## Predict the class of every test row with the fitted forest.
predicted.response <- predict(rf, dresstest)
## Compare predictions with the true labels. positive = "1" makes the
## class-specific statistics (sensitivity, specificity, predictive values)
## describe class 1 instead of caret's default of the first factor level ("0").
## The output recorded below was produced with the old default ("0").
confusionMatrix(data = predicted.response,
                reference = dresstest$default.payment.next.month,
                positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 5497 1025
## 1 344 634
##
## Accuracy : 0.8175
## 95% CI : (0.8085, 0.8262)
## No Information Rate : 0.7788
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.379
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9411
## Specificity : 0.3822
## Pos Pred Value : 0.8428
## Neg Pred Value : 0.6483
## Prevalence : 0.7788
## Detection Rate : 0.7329
## Detection Prevalence : 0.8696
## Balanced Accuracy : 0.6616
##
## 'Positive' Class : 0
##