Introduction

Should This Loan be Approved or Denied?

In this report, we explore attributes of SBAs loan data from a variety of small business loan records. The sba has offered a collection of data on these small businesses and their loan applications. The businesses in the dataset represent a variety of industries locations, sizes, and other demographic attributes. There are over 800,000 businesses with 27 varables collected over a span of several decades.

We are interested in a few problems and questions besides what have been explored in the class. What are the most important five features in predicting default rate. Is KNN better or Logistic regression better when predicting default rate using the five most important features.

Analysis

Input and Preprocessing data

##load Original Data
library(readr)
SBAnational <- read_csv("~/Desktop/SBAnational.csv", col_types = cols(BalanceGross = col_number(),  DisbursementGross = col_number(),  GrAppv = col_number(), SBA_Appv = col_number()))

#Drop properties with too much distinct values or unrelated
drops <- c("LoanNr_ChkDgt", "Name","City","Zip","NAICS", "ApprovalDate", "ApprovalFY","Bank", "ChgOffDate","DisbursementDate","ChgOffPrinGr")
Ourdata = SBAnational[ , !(names(SBAnational) %in% drops)]

# use sql to preprocess data with valid input
library(sqldf)
Ourdata = sqldf('select * from Ourdata where (MIS_Status = "P I F" or MIS_Status = "CHGOFF") and (NewExist = 1 or NewExist = 2) and (FranchiseCode = 0 or FranchiseCode = 1) and (UrbanRural = 1 or UrbanRural = 2) and (RevLineCr = "Y" or RevLineCr = "N") and (LowDoc = "Y" or LowDoc = "N") and (BankState <> "AN")')

#clean all NUlls
Ourdata = na.omit(Ourdata)

#change char data input to factor
Ourdata$MIS_Status = factor(Ourdata$MIS_Status)
Ourdata$State = factor(Ourdata$State)
Ourdata$BankState = factor(Ourdata$BankState)
Ourdata$RevLineCr = factor(Ourdata$RevLineCr)
Ourdata$LowDoc = factor(Ourdata$LowDoc)

summary(Ourdata)
##      State          BankState           Term           NoEmp     
##  CA     : 47581   NC     : 52293   Min.   :  0.0   Min.   :   0  
##  NY     : 28483   IL     : 36214   1st Qu.: 49.0   1st Qu.:   2  
##  TX     : 21227   CA     : 34245   Median : 84.0   Median :   3  
##  FL     : 19707   RI     : 32949   Mean   : 81.7   Mean   :   8  
##  PA     : 15594   OH     : 29566   3rd Qu.: 84.0   3rd Qu.:   8  
##  OH     : 14999   VA     : 17980   Max.   :527.0   Max.   :8000  
##  (Other):200458   (Other):144802                                 
##     NewExist      CreateJob     RetainedJob   FranchiseCode     UrbanRural  
##  Min.   :1.00   Min.   :   0   Min.   :   0   Min.   :0.000   Min.   :1.00  
##  1st Qu.:1.00   1st Qu.:   0   1st Qu.:   1   1st Qu.:0.000   1st Qu.:1.00  
##  Median :1.00   Median :   0   Median :   2   Median :0.000   Median :1.00  
##  Mean   :1.26   Mean   :   2   Mean   :   6   Mean   :0.499   Mean   :1.18  
##  3rd Qu.:2.00   3rd Qu.:   2   3rd Qu.:   6   3rd Qu.:1.000   3rd Qu.:1.00  
##  Max.   :2.00   Max.   :5085   Max.   :4441   Max.   :1.000   Max.   :2.00  
##                                                                             
##  RevLineCr  LowDoc     DisbursementGross   BalanceGross     MIS_Status    
##  N:159017   N:346622   Min.   :    4000   Min.   :     0   CHGOFF: 92573  
##  Y:189032   Y:  1427   1st Qu.:   27000   1st Qu.:     0   P I F :255476  
##                        Median :   61000   Median :     0                  
##                        Mean   :  157439   Mean   :     6                  
##                        3rd Qu.:  152899   3rd Qu.:     0                  
##                        Max.   :11446325   Max.   :996262                  
##                                                                           
##      GrAppv           SBA_Appv      
##  Min.   :   1000   Min.   :    500  
##  1st Qu.:  25000   1st Qu.:  12500  
##  Median :  50000   Median :  25000  
##  Mean   : 133651   Mean   :  97315  
##  3rd Qu.: 100100   3rd Qu.:  72056  
##  Max.   :5000000   Max.   :4500000  
## 

Feature Engineering

#run randomforest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
train <- sample(nrow(Ourdata), 0.7*nrow(Ourdata), replace = FALSE)
TrainSet <- Ourdata[train,]
ValidSet <- Ourdata[-train,]
model <- randomForest(MIS_Status ~ ., data = TrainSet, ntree = 100, mtry = 5, importance = TRUE)
model
## 
## Call:
##  randomForest(formula = MIS_Status ~ ., data = TrainSet, ntree = 100,      mtry = 5, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 5.66%
## Confusion matrix:
##        CHGOFF  P I F class.error
## CHGOFF  56232   8589     0.13250
## P I F    5191 173622     0.02903
# Predicting on Validation set
predValid <- predict(model, ValidSet, type = "class")
# Checking classification accuracy
accuracy = mean(predValid == ValidSet$MIS_Status)
#confusion matrix
table(predValid,ValidSet$MIS_Status)
##          
## predValid CHGOFF P I F
##    CHGOFF  24123  2208
##    P I F    3629 74455
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
varImpPlot(model) 

The accurqcy of our model is 0.9441.

features = c("State","BankState","Term","FranchiseCode","DisbursementGross","MIS_Status")
temp = Ourdata[ , (names(Ourdata) %in% features)]
Ourdata = temp

Logistic Regression

total_performance = 0
best_logistic = 0
best_score = 0
best_table = 0
for (i in 1:10){
  index_1 = createDataPartition(Ourdata$State, p = 0.85, list = F)
  index_2 = createDataPartition(Ourdata$BankState, p = 0.85, list = F)
  train = intersect(index_1, index_2)
  TrainSet <- Ourdata[train,]
  ValidSet <- Ourdata[-train,]
  glm.fit <- glm(MIS_Status ~ State + BankState + Term + FranchiseCode + DisbursementGross, data = TrainSet, family = binomial)
  #levels(ValidSet$MIS_Status)
  glm.probs <- predict(glm.fit, newdata = ValidSet, type = "response")
  glm.pred <- ifelse(glm.probs > 0.5, "P I F", "CHGOFF")
  cur_score = mean(glm.pred == ValidSet$MIS_Status)
  total_performance = total_performance + cur_score
  if (cur_score > best_score){
    best_score = cur_score
    best_logistic = glm.fit
    best_table = table(glm.pred, ValidSet$MIS_Status)
  }
}
summary(best_logistic)
## 
## Call:
## glm(formula = MIS_Status ~ State + BankState + Term + FranchiseCode + 
##     DisbursementGross, family = binomial, data = TrainSet)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -5.366  -0.596   0.388   0.659   2.884  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -4.87e-01   3.96e-01   -1.23  0.21956    
## StateAL           -1.23e+00   2.24e-01   -5.51  3.5e-08 ***
## StateAR           -1.71e+00   2.34e-01   -7.29  3.1e-13 ***
## StateAZ           -1.30e+00   2.15e-01   -6.05  1.5e-09 ***
## StateCA           -1.15e+00   2.12e-01   -5.40  6.6e-08 ***
## StateCO           -1.22e+00   2.15e-01   -5.68  1.3e-08 ***
## StateCT           -3.50e-01   2.19e-01   -1.60  0.10947    
## StateDC           -8.44e-01   2.47e-01   -3.42  0.00062 ***
## StateDE           -1.05e+00   2.38e-01   -4.40  1.1e-05 ***
## StateFL           -1.47e+00   2.13e-01   -6.90  5.1e-12 ***
## StateGA           -1.67e+00   2.15e-01   -7.77  8.0e-15 ***
## StateHI           -1.62e+00   2.70e-01   -6.03  1.7e-09 ***
## StateIA           -7.11e-01   2.36e-01   -3.01  0.00260 ** 
## StateID           -9.26e-01   2.22e-01   -4.18  2.9e-05 ***
## StateIL           -1.07e+00   2.14e-01   -5.00  5.7e-07 ***
## StateIN           -9.15e-01   2.17e-01   -4.22  2.5e-05 ***
## StateKS           -1.05e+00   2.29e-01   -4.59  4.4e-06 ***
## StateKY           -1.01e+00   2.22e-01   -4.55  5.5e-06 ***
## StateLA           -1.29e+00   2.23e-01   -5.78  7.7e-09 ***
## StateMA           -5.55e-01   2.16e-01   -2.58  0.01000 *  
## StateMD           -9.56e-01   2.17e-01   -4.40  1.1e-05 ***
## StateME           -2.08e-01   2.34e-01   -0.89  0.37412    
## StateMI           -1.11e+00   2.15e-01   -5.17  2.4e-07 ***
## StateMN           -9.05e-01   2.17e-01   -4.16  3.2e-05 ***
## StateMO           -1.13e+00   2.18e-01   -5.17  2.3e-07 ***
## StateMS           -1.87e+00   2.33e-01   -8.02  1.1e-15 ***
## StateMT           -3.54e-01   2.59e-01   -1.37  0.17202    
## StateNC           -7.29e-01   2.17e-01   -3.36  0.00077 ***
## StateND            1.20e-01   2.91e-01    0.41  0.68082    
## StateNE           -6.97e-01   2.46e-01   -2.83  0.00459 ** 
## StateNH           -5.97e-01   2.20e-01   -2.71  0.00666 ** 
## StateNJ           -9.22e-01   2.15e-01   -4.30  1.7e-05 ***
## StateNM           -5.61e-01   2.35e-01   -2.38  0.01716 *  
## StateNV           -1.71e+00   2.19e-01   -7.80  6.1e-15 ***
## StateNY           -8.15e-01   2.13e-01   -3.82  0.00013 ***
## StateOH           -7.64e-01   2.14e-01   -3.57  0.00036 ***
## StateOK           -1.20e+00   2.26e-01   -5.31  1.1e-07 ***
## StateOR           -6.73e-01   2.17e-01   -3.10  0.00195 ** 
## StatePA           -7.57e-01   2.15e-01   -3.52  0.00042 ***
## StateRI           -3.41e-01   2.20e-01   -1.55  0.12192    
## StateSC           -8.53e-01   2.24e-01   -3.80  0.00014 ***
## StateSD            2.30e-01   2.52e-01    0.92  0.35981    
## StateTN           -1.49e+00   2.20e-01   -6.77  1.2e-11 ***
## StateTX           -8.24e-01   2.13e-01   -3.86  0.00011 ***
## StateUT           -9.77e-01   2.18e-01   -4.48  7.6e-06 ***
## StateVA           -6.47e-01   2.17e-01   -2.98  0.00284 ** 
## StateVT           -1.47e-01   2.47e-01   -0.60  0.55140    
## StateWA           -9.43e-01   2.15e-01   -4.38  1.2e-05 ***
## StateWI           -6.65e-01   2.19e-01   -3.03  0.00241 ** 
## StateWV           -6.43e-01   2.49e-01   -2.58  0.00981 ** 
## StateWY           -1.57e-01   3.24e-01   -0.49  0.62763    
## BankStateAL       -6.92e-01   4.51e-01   -1.53  0.12537    
## BankStateAR        7.35e-01   4.61e-01    1.59  0.11075    
## BankStateAZ        7.22e-01   4.86e-01    1.48  0.13757    
## BankStateCA       -1.34e+00   4.48e-01   -2.99  0.00278 ** 
## BankStateCO        2.50e-01   4.72e-01    0.53  0.59686    
## BankStateCT       -4.87e-01   4.56e-01   -1.07  0.28524    
## BankStateDC       -1.44e+00   5.37e-01   -2.67  0.00751 ** 
## BankStateDE       -3.23e-01   4.49e-01   -0.72  0.47174    
## BankStateFL       -1.29e+00   4.50e-01   -2.86  0.00419 ** 
## BankStateGA        5.93e-01   4.53e-01    1.31  0.19028    
## BankStateHI        1.47e+00   4.95e-01    2.96  0.00306 ** 
## BankStateIA        1.62e-01   4.67e-01    0.35  0.72880    
## BankStateID       -2.66e-01   4.63e-01   -0.57  0.56554    
## BankStateIL       -8.47e-01   4.48e-01   -1.89  0.05881 .  
## BankStateIN        9.96e-01   4.58e-01    2.17  0.02966 *  
## BankStateKS        1.01e+00   4.67e-01    2.17  0.03018 *  
## BankStateKY        6.63e-01   4.68e-01    1.42  0.15651    
## BankStateLA        6.93e-01   4.73e-01    1.47  0.14245    
## BankStateMA        4.99e-01   4.53e-01    1.10  0.27156    
## BankStateMD        9.35e-01   4.61e-01    2.03  0.04238 *  
## BankStateME        8.18e-01   4.78e-01    1.71  0.08707 .  
## BankStateMI        7.82e-01   4.59e-01    1.70  0.08886 .  
## BankStateMN        5.35e-01   4.53e-01    1.18  0.23808    
## BankStateMO        2.49e-01   4.54e-01    0.55  0.58327    
## BankStateMS        1.93e+00   4.66e-01    4.14  3.5e-05 ***
## BankStateMT        1.13e+00   4.83e-01    2.33  0.01974 *  
## BankStateNC       -7.94e-01   4.48e-01   -1.77  0.07625 .  
## BankStateND        4.53e-01   4.91e-01    0.92  0.35626    
## BankStateNE        5.97e-01   4.76e-01    1.25  0.20970    
## BankStateNH        1.78e-01   4.62e-01    0.39  0.69971    
## BankStateNJ        9.83e-02   4.56e-01    0.22  0.82944    
## BankStateNM        4.58e-01   4.73e-01    0.97  0.33273    
## BankStateNV       -4.62e-03   4.67e-01   -0.01  0.99211    
## BankStateNY       -1.68e-01   4.49e-01   -0.37  0.70840    
## BankStateOH       -2.87e-01   4.48e-01   -0.64  0.52192    
## BankStateOK        4.99e-01   4.64e-01    1.08  0.28151    
## BankStateOR       -1.11e+00   4.50e-01   -2.47  0.01359 *  
## BankStatePA        7.91e-01   4.52e-01    1.75  0.08012 .  
## BankStatePR       -3.55e+00   1.42e+00   -2.49  0.01280 *  
## BankStateRI       -5.85e-01   4.48e-01   -1.31  0.19175    
## BankStateSC       -2.94e+00   4.68e-01   -6.27  3.6e-10 ***
## BankStateSD       -9.31e-01   4.48e-01   -2.08  0.03762 *  
## BankStateTN       -2.89e-01   4.65e-01   -0.62  0.53376    
## BankStateTX       -4.71e-01   4.51e-01   -1.05  0.29545    
## BankStateUT       -4.20e-01   4.51e-01   -0.93  0.35188    
## BankStateVA       -1.51e+00   4.48e-01   -3.36  0.00077 ***
## BankStateVT        1.28e+00   4.83e-01    2.65  0.00815 ** 
## BankStateWA       -5.98e-02   4.57e-01   -0.13  0.89584    
## BankStateWI       -1.24e-01   4.53e-01   -0.27  0.78472    
## BankStateWV       -5.89e-01   5.10e-01   -1.15  0.24819    
## BankStateWY        9.50e-01   5.89e-01    1.61  0.10656    
## Term               4.21e-02   2.20e-04  191.25  < 2e-16 ***
## FranchiseCode      8.51e-01   1.18e-02   72.44  < 2e-16 ***
## DisbursementGross  3.37e-07   2.95e-08   11.42  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 291335  on 251489  degrees of freedom
## Residual deviance: 212432  on 251385  degrees of freedom
## AIC: 212642
## 
## Number of Fisher Scoring iterations: 6
best_score
## [1] 0.8321
best_table
##         
## glm.pred CHGOFF P I F
##   CHGOFF  14085  4615
##   P I F   11597 66262
total_performance/10
## [1] 0.8313