Predict whether a customer is Risky or Good from the Fraud Check data and its attributes using a Random Forest.

#install.packages("randomForest")
#install.packages("Mass")
#install.packages("caret")
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.1
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(MASS)
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
# Use set.seed() so that we get the same results each time
set.seed(123)


FraudCheck <- read.csv(file.choose())
hist(FraudCheck$Taxable.Income)

hist(FraudCheck$Taxable.Income, main = "Taxable Income", xlim = c(0, 100000),
     breaks = 40, col = c("blue", "red", "green", "violet"))

Risky_Good = ifelse(FraudCheck$Taxable.Income<= 30000, "Risky", "Good")
# if Taxable Income is less than or equal to 30000 then Risky else Good.
FCtemp= data.frame(FraudCheck,Risky_Good)
FC = FCtemp[,c(1:7)]

str(FC)
## 'data.frame':    600 obs. of  7 variables:
##  $ Undergrad      : Factor w/ 2 levels "NO","YES": 1 2 1 2 1 1 1 2 1 2 ...
##  $ Marital.Status : Factor w/ 3 levels "Divorced","Married",..: 3 1 2 3 2 1 1 3 3 1 ...
##  $ Taxable.Income : int  68833 33700 36925 50190 81002 33329 83357 62774 83519 98152 ...
##  $ City.Population: int  50047 134075 160205 193264 27533 116382 80890 131253 102481 155482 ...
##  $ Work.Experience: int  10 18 30 15 28 0 8 3 12 4 ...
##  $ Urban          : Factor w/ 2 levels "NO","YES": 2 2 2 2 1 1 2 2 2 2 ...
##  $ Risky_Good     : Factor w/ 2 levels "Good","Risky": 1 1 1 1 1 1 1 1 1 1 ...
table(FC$Risky_Good)   # 476 good customers and 124 risky customers
## 
##  Good Risky 
##   476   124
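# The classes are imbalanced: always predicting "Good" would already be about 79% accurate,
# which is the No Information Rate reported by confusionMatrix() later. A quick check:
prop.table(table(FC$Risky_Good))   # class proportions; the "Good" share is the baseline accuracy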
# Data Partition
set.seed(123)
ind <- sample(2, nrow(FC), replace = TRUE, prob = c(0.7,0.3))
train <- FC[ind==1,]
test  <- FC[ind==2,]
set.seed(213)
rf <- randomForest(Risky_Good~., data=train)
rf  # Summary of the random forest: number of trees, and mtry = number of variables tried at each split of each tree node
## 
## Call:
##  randomForest(formula = Risky_Good ~ ., data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 0.47%
## Confusion matrix:
##       Good Risky class.error
## Good   343     1 0.002906977
## Risky    1    79 0.012500000
   # The out-of-bag (OOB) estimate of the error rate is 0.47% for this Random Forest model.
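# The OOB error can also be read straight from the fitted object: rf$err.rate holds the
# cumulative OOB and per-class error rates after each tree is added.
tail(rf$err.rate, 3)                    # OOB, Good and Risky error after the last few trees
rf$err.rate[nrow(rf$err.rate), "OOB"]   # final OOB error rate (~0.0047 here)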
attributes(rf)
## $names
##  [1] "call"            "type"            "predicted"      
##  [4] "err.rate"        "confusion"       "votes"          
##  [7] "oob.times"       "classes"         "importance"     
## [10] "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"         
## [16] "y"               "test"            "inbag"          
## [19] "terms"          
## 
## $class
## [1] "randomForest.formula" "randomForest"
# Prediction and Confusion Matrix - Training data 
pred1 <- predict(rf, train)
head(pred1)
##    1    3    6    7    9   10 
## Good Good Good Good Good Good 
## Levels: Good Risky
head(train$Risky_Good)
## [1] Good Good Good Good Good Good
## Levels: Good Risky
# The first six predicted values match the original values.

confusionMatrix(pred1, train$Risky_Good)   # 100 % accuracy on training data 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Good Risky
##      Good   344     0
##      Risky    0    80
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9913, 1)
##     No Information Rate : 0.8113     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.8113     
##          Detection Rate : 0.8113     
##    Detection Prevalence : 0.8113     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : Good       
## 
 # The 95% confidence interval for accuracy is (0.9913, 1).
 # Sensitivity and specificity are both 100%.
 # Perfect training accuracy is expected here: the Risky_Good label was derived directly
 # from Taxable.Income, which is still available as a predictor.
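# A one-line cross-check of the accuracy reported by confusionMatrix():
mean(pred1 == train$Risky_Good)   # proportion of correct training predictions; 1 here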

# Prediction on the test data
pred2 <- predict(rf, test)
confusionMatrix(pred2, test$Risky_Good) # 100 % accuracy on test data 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Good Risky
##      Good   132     0
##      Risky    0    44
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9793, 1)
##     No Information Rate : 0.75       
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.00       
##             Specificity : 1.00       
##          Pos Pred Value : 1.00       
##          Neg Pred Value : 1.00       
##              Prevalence : 0.75       
##          Detection Rate : 0.75       
##    Detection Prevalence : 0.75       
##       Balanced Accuracy : 1.00       
##                                      
##        'Positive' Class : Good       
## 
# Error rate of the Random Forest model:
plot(rf)

# The OOB error flattens out at around 200 trees and does not improve beyond that.
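# The point where the OOB error bottoms out can be found programmatically (the exact
# index depends on the seed):
which.min(rf$err.rate[, "OOB"])   # smallest number of trees reaching the lowest OOB error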

# Tune the Random Forest mtry parameter (column 7 of train is the target Risky_Good,
# so it is excluded from the predictors)
tune <- tuneRF(train[,-7], train[,7], stepFactor = 0.5, plot = TRUE, ntreeTry = 300,
       trace = TRUE, improve = 0.05)
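# tuneRF() returns a matrix with columns mtry and OOBError, so the best value can be
# picked programmatically rather than read off the plot:
best_mtry <- tune[which.min(tune[, "OOBError"]), "mtry"]
best_mtry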

rf1 <- randomForest(Risky_Good~., data=train, ntree = 200, mtry = 2, importance = TRUE,
                   proximity = TRUE)
rf1
## 
## Call:
##  randomForest(formula = Risky_Good ~ ., data = train, ntree = 200,      mtry = 2, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 200
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 0.24%
## Confusion matrix:
##       Good Risky class.error
## Good   343     1 0.002906977
## Risky    0    80 0.000000000
pred1 <- predict(rf1, train)
confusionMatrix(pred1, train$Risky_Good)  # 100 % accuracy on training data 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Good Risky
##      Good   344     0
##      Risky    0    80
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9913, 1)
##     No Information Rate : 0.8113     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.8113     
##          Detection Rate : 0.8113     
##    Detection Prevalence : 0.8113     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : Good       
## 
  # The 95% confidence interval for accuracy is (0.9913, 1).
  # Sensitivity and specificity are both 100%.

# Test data prediction using the tuned rf1 model
pred2 <- predict(rf1, test)
confusionMatrix(pred2, test$Risky_Good) # 100 % accuracy on test data 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Good Risky
##      Good   132     0
##      Risky    0    44
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9793, 1)
##     No Information Rate : 0.75       
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.00       
##             Specificity : 1.00       
##          Pos Pred Value : 1.00       
##          Neg Pred Value : 1.00       
##              Prevalence : 0.75       
##          Detection Rate : 0.75       
##    Detection Prevalence : 0.75       
##       Balanced Accuracy : 1.00       
##                                      
##        'Positive' Class : Good       
## 
# The 95% confidence interval for accuracy is (0.9793, 1).


# Number of nodes per tree

hist(treesize(rf1), main = "Number of Nodes per Tree", col = "green")

# The majority of trees have more than 80 terminal nodes.
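# The same information numerically, as a five-number summary of terminal-node counts:
summary(treesize(rf1))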

# Variable Importance :

varImpPlot(rf1)

# The Mean Decrease Accuracy plot shows how much worse the model performs when each
# variable is excluded: Taxable.Income, from which the label was derived, is by far the
# most important predictor, while City.Population contributes almost nothing.

# The Mean Decrease Gini plot shows the average reduction in Gini impurity from splits
# on each variable. Again Taxable.Income dominates, and Urban matters least.

varImpPlot(rf1, sort = TRUE, n.var = 5, main = "Top 5 - Variable Importance")

# Quantitative values 
importance(rf1)
##                       Good      Risky MeanDecreaseAccuracy
## Undergrad        0.9697747  0.7452060            1.0746869
## Marital.Status   2.3190683 -0.1647737            1.6152058
## Taxable.Income  57.7118526 74.8748950           64.0732787
## City.Population  0.6655007 -2.5773995           -1.4062559
## Work.Experience  0.1951129 -1.0500176           -0.8912347
## Urban            1.2305502  0.1132935            0.9469538
##                 MeanDecreaseGini
## Undergrad              0.6916259
## Marital.Status         1.8147131
## Taxable.Income       115.2485651
## City.Population        4.5044196
## Work.Experience        3.0332879
## Urban                  0.5027630
varUsed(rf)   # how many times each predictor variable is actually used for splitting in the forest
## [1]  315  492 1719 1275  962  325
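# Pairing those counts with variable names makes them readable. This assumes varUsed()
# reports counts in the same variable order as the importance matrix (the randomForest
# convention):
setNames(varUsed(rf), rownames(importance(rf)))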
# Partial Dependence Plot 
partialPlot(rf1, train, Taxable.Income, "Good")

# The plot shows that customers with Taxable.Income above 30000 are predicted Good,
# while those at or below 30000 are predicted Risky.
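# The complementary partial dependence for the "Risky" class is the mirror image (a sketch):
partialPlot(rf1, train, Taxable.Income, "Risky", main = "Partial Dependence: Risky")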
# Extract a single tree from the forest:

tr1 <- getTree(rf1, 2, labelVar = TRUE)

# Multi-dimensional scaling (MDS) plot of the proximity matrix
MDSplot(rf1, train$Risky_Good)   # use the training labels, which match rf1's proximity matrix
## Warning in RColorBrewer::brewer.pal(nlevs, "Set1"): minimal value for n is 3, returning requested palette with 3 different levels