Should This Loan be Approved or Denied?
In this report, we explore attributes of SBAs loan data from a variety of small business loan records. The sba has offered a collection of data on these small businesses and their loan applications. The businesses in the dataset represent a variety of industries locations, sizes, and other demographic attributes. There are over 800,000 businesses with 27 varables collected over a span of several decades.
We are interested in a few problems and questions besides what have been explored in the class. What are the most important five features in predicting default rate. Is KNN better or Logistic regression better when predicting default rate using the five most important features.
##load Original Data
library(readr)
SBAnational <- read_csv("~/Desktop/SBAnational.csv", col_types = cols(BalanceGross = col_number(), DisbursementGross = col_number(), GrAppv = col_number(), SBA_Appv = col_number()))
#Drop properties with too much distinct values or unrelated
drops <- c("LoanNr_ChkDgt", "Name","City","Zip","NAICS", "ApprovalDate", "ApprovalFY","Bank", "ChgOffDate","DisbursementDate","ChgOffPrinGr")
Ourdata = SBAnational[ , !(names(SBAnational) %in% drops)]
# use sql to preprocess data with valid input
library(sqldf)
Ourdata = sqldf('select * from Ourdata where (MIS_Status = "P I F" or MIS_Status = "CHGOFF") and (NewExist = 1 or NewExist = 2) and (FranchiseCode = 0 or FranchiseCode = 1) and (UrbanRural = 1 or UrbanRural = 2) and (RevLineCr = "Y" or RevLineCr = "N") and (LowDoc = "Y" or LowDoc = "N") and (BankState <> "AN")')
#clean all NUlls
Ourdata = na.omit(Ourdata)
#change char data input to factor
Ourdata$MIS_Status = factor(Ourdata$MIS_Status)
Ourdata$State = factor(Ourdata$State)
Ourdata$BankState = factor(Ourdata$BankState)
Ourdata$RevLineCr = factor(Ourdata$RevLineCr)
Ourdata$LowDoc = factor(Ourdata$LowDoc)
summary(Ourdata)
## State BankState Term NoEmp
## CA : 47581 NC : 52293 Min. : 0.0 Min. : 0
## NY : 28483 IL : 36214 1st Qu.: 49.0 1st Qu.: 2
## TX : 21227 CA : 34245 Median : 84.0 Median : 3
## FL : 19707 RI : 32949 Mean : 81.7 Mean : 8
## PA : 15594 OH : 29566 3rd Qu.: 84.0 3rd Qu.: 8
## OH : 14999 VA : 17980 Max. :527.0 Max. :8000
## (Other):200458 (Other):144802
## NewExist CreateJob RetainedJob FranchiseCode UrbanRural
## Min. :1.00 Min. : 0 Min. : 0 Min. :0.000 Min. :1.00
## 1st Qu.:1.00 1st Qu.: 0 1st Qu.: 1 1st Qu.:0.000 1st Qu.:1.00
## Median :1.00 Median : 0 Median : 2 Median :0.000 Median :1.00
## Mean :1.26 Mean : 2 Mean : 6 Mean :0.499 Mean :1.18
## 3rd Qu.:2.00 3rd Qu.: 2 3rd Qu.: 6 3rd Qu.:1.000 3rd Qu.:1.00
## Max. :2.00 Max. :5085 Max. :4441 Max. :1.000 Max. :2.00
##
## RevLineCr LowDoc DisbursementGross BalanceGross MIS_Status
## N:159017 N:346622 Min. : 4000 Min. : 0 CHGOFF: 92573
## Y:189032 Y: 1427 1st Qu.: 27000 1st Qu.: 0 P I F :255476
## Median : 61000 Median : 0
## Mean : 157439 Mean : 6
## 3rd Qu.: 152899 3rd Qu.: 0
## Max. :11446325 Max. :996262
##
## GrAppv SBA_Appv
## Min. : 1000 Min. : 500
## 1st Qu.: 25000 1st Qu.: 12500
## Median : 50000 Median : 25000
## Mean : 133651 Mean : 97315
## 3rd Qu.: 100100 3rd Qu.: 72056
## Max. :5000000 Max. :4500000
##
#run randomforest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
train <- sample(nrow(Ourdata), 0.7*nrow(Ourdata), replace = FALSE)
TrainSet <- Ourdata[train,]
ValidSet <- Ourdata[-train,]
model <- randomForest(MIS_Status ~ ., data = TrainSet, ntree = 100, mtry = 5, importance = TRUE)
model
##
## Call:
## randomForest(formula = MIS_Status ~ ., data = TrainSet, ntree = 100, mtry = 5, importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 5.66%
## Confusion matrix:
## CHGOFF P I F class.error
## CHGOFF 56232 8589 0.13250
## P I F 5191 173622 0.02903
# Predicting on Validation set
predValid <- predict(model, ValidSet, type = "class")
# Checking classification accuracy
accuracy = mean(predValid == ValidSet$MIS_Status)
#confusion matrix
table(predValid,ValidSet$MIS_Status)
##
## predValid CHGOFF P I F
## CHGOFF 24123 2208
## P I F 3629 74455
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
varImpPlot(model)
The accurqcy of our model is 0.9441.
features = c("State","BankState","Term","FranchiseCode","DisbursementGross","MIS_Status")
temp = Ourdata[ , (names(Ourdata) %in% features)]
Ourdata = temp
total_performance = 0
best_logistic = 0
best_score = 0
best_table = 0
for (i in 1:10){
index_1 = createDataPartition(Ourdata$State, p = 0.85, list = F)
index_2 = createDataPartition(Ourdata$BankState, p = 0.85, list = F)
train = intersect(index_1, index_2)
TrainSet <- Ourdata[train,]
ValidSet <- Ourdata[-train,]
glm.fit <- glm(MIS_Status ~ State + BankState + Term + FranchiseCode + DisbursementGross, data = TrainSet, family = binomial)
#levels(ValidSet$MIS_Status)
glm.probs <- predict(glm.fit, newdata = ValidSet, type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "P I F", "CHGOFF")
cur_score = mean(glm.pred == ValidSet$MIS_Status)
total_performance = total_performance + cur_score
if (cur_score > best_score){
best_score = cur_score
best_logistic = glm.fit
best_table = table(glm.pred, ValidSet$MIS_Status)
}
}
summary(best_logistic)
##
## Call:
## glm(formula = MIS_Status ~ State + BankState + Term + FranchiseCode +
## DisbursementGross, family = binomial, data = TrainSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.366 -0.596 0.388 0.659 2.884
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.87e-01 3.96e-01 -1.23 0.21956
## StateAL -1.23e+00 2.24e-01 -5.51 3.5e-08 ***
## StateAR -1.71e+00 2.34e-01 -7.29 3.1e-13 ***
## StateAZ -1.30e+00 2.15e-01 -6.05 1.5e-09 ***
## StateCA -1.15e+00 2.12e-01 -5.40 6.6e-08 ***
## StateCO -1.22e+00 2.15e-01 -5.68 1.3e-08 ***
## StateCT -3.50e-01 2.19e-01 -1.60 0.10947
## StateDC -8.44e-01 2.47e-01 -3.42 0.00062 ***
## StateDE -1.05e+00 2.38e-01 -4.40 1.1e-05 ***
## StateFL -1.47e+00 2.13e-01 -6.90 5.1e-12 ***
## StateGA -1.67e+00 2.15e-01 -7.77 8.0e-15 ***
## StateHI -1.62e+00 2.70e-01 -6.03 1.7e-09 ***
## StateIA -7.11e-01 2.36e-01 -3.01 0.00260 **
## StateID -9.26e-01 2.22e-01 -4.18 2.9e-05 ***
## StateIL -1.07e+00 2.14e-01 -5.00 5.7e-07 ***
## StateIN -9.15e-01 2.17e-01 -4.22 2.5e-05 ***
## StateKS -1.05e+00 2.29e-01 -4.59 4.4e-06 ***
## StateKY -1.01e+00 2.22e-01 -4.55 5.5e-06 ***
## StateLA -1.29e+00 2.23e-01 -5.78 7.7e-09 ***
## StateMA -5.55e-01 2.16e-01 -2.58 0.01000 *
## StateMD -9.56e-01 2.17e-01 -4.40 1.1e-05 ***
## StateME -2.08e-01 2.34e-01 -0.89 0.37412
## StateMI -1.11e+00 2.15e-01 -5.17 2.4e-07 ***
## StateMN -9.05e-01 2.17e-01 -4.16 3.2e-05 ***
## StateMO -1.13e+00 2.18e-01 -5.17 2.3e-07 ***
## StateMS -1.87e+00 2.33e-01 -8.02 1.1e-15 ***
## StateMT -3.54e-01 2.59e-01 -1.37 0.17202
## StateNC -7.29e-01 2.17e-01 -3.36 0.00077 ***
## StateND 1.20e-01 2.91e-01 0.41 0.68082
## StateNE -6.97e-01 2.46e-01 -2.83 0.00459 **
## StateNH -5.97e-01 2.20e-01 -2.71 0.00666 **
## StateNJ -9.22e-01 2.15e-01 -4.30 1.7e-05 ***
## StateNM -5.61e-01 2.35e-01 -2.38 0.01716 *
## StateNV -1.71e+00 2.19e-01 -7.80 6.1e-15 ***
## StateNY -8.15e-01 2.13e-01 -3.82 0.00013 ***
## StateOH -7.64e-01 2.14e-01 -3.57 0.00036 ***
## StateOK -1.20e+00 2.26e-01 -5.31 1.1e-07 ***
## StateOR -6.73e-01 2.17e-01 -3.10 0.00195 **
## StatePA -7.57e-01 2.15e-01 -3.52 0.00042 ***
## StateRI -3.41e-01 2.20e-01 -1.55 0.12192
## StateSC -8.53e-01 2.24e-01 -3.80 0.00014 ***
## StateSD 2.30e-01 2.52e-01 0.92 0.35981
## StateTN -1.49e+00 2.20e-01 -6.77 1.2e-11 ***
## StateTX -8.24e-01 2.13e-01 -3.86 0.00011 ***
## StateUT -9.77e-01 2.18e-01 -4.48 7.6e-06 ***
## StateVA -6.47e-01 2.17e-01 -2.98 0.00284 **
## StateVT -1.47e-01 2.47e-01 -0.60 0.55140
## StateWA -9.43e-01 2.15e-01 -4.38 1.2e-05 ***
## StateWI -6.65e-01 2.19e-01 -3.03 0.00241 **
## StateWV -6.43e-01 2.49e-01 -2.58 0.00981 **
## StateWY -1.57e-01 3.24e-01 -0.49 0.62763
## BankStateAL -6.92e-01 4.51e-01 -1.53 0.12537
## BankStateAR 7.35e-01 4.61e-01 1.59 0.11075
## BankStateAZ 7.22e-01 4.86e-01 1.48 0.13757
## BankStateCA -1.34e+00 4.48e-01 -2.99 0.00278 **
## BankStateCO 2.50e-01 4.72e-01 0.53 0.59686
## BankStateCT -4.87e-01 4.56e-01 -1.07 0.28524
## BankStateDC -1.44e+00 5.37e-01 -2.67 0.00751 **
## BankStateDE -3.23e-01 4.49e-01 -0.72 0.47174
## BankStateFL -1.29e+00 4.50e-01 -2.86 0.00419 **
## BankStateGA 5.93e-01 4.53e-01 1.31 0.19028
## BankStateHI 1.47e+00 4.95e-01 2.96 0.00306 **
## BankStateIA 1.62e-01 4.67e-01 0.35 0.72880
## BankStateID -2.66e-01 4.63e-01 -0.57 0.56554
## BankStateIL -8.47e-01 4.48e-01 -1.89 0.05881 .
## BankStateIN 9.96e-01 4.58e-01 2.17 0.02966 *
## BankStateKS 1.01e+00 4.67e-01 2.17 0.03018 *
## BankStateKY 6.63e-01 4.68e-01 1.42 0.15651
## BankStateLA 6.93e-01 4.73e-01 1.47 0.14245
## BankStateMA 4.99e-01 4.53e-01 1.10 0.27156
## BankStateMD 9.35e-01 4.61e-01 2.03 0.04238 *
## BankStateME 8.18e-01 4.78e-01 1.71 0.08707 .
## BankStateMI 7.82e-01 4.59e-01 1.70 0.08886 .
## BankStateMN 5.35e-01 4.53e-01 1.18 0.23808
## BankStateMO 2.49e-01 4.54e-01 0.55 0.58327
## BankStateMS 1.93e+00 4.66e-01 4.14 3.5e-05 ***
## BankStateMT 1.13e+00 4.83e-01 2.33 0.01974 *
## BankStateNC -7.94e-01 4.48e-01 -1.77 0.07625 .
## BankStateND 4.53e-01 4.91e-01 0.92 0.35626
## BankStateNE 5.97e-01 4.76e-01 1.25 0.20970
## BankStateNH 1.78e-01 4.62e-01 0.39 0.69971
## BankStateNJ 9.83e-02 4.56e-01 0.22 0.82944
## BankStateNM 4.58e-01 4.73e-01 0.97 0.33273
## BankStateNV -4.62e-03 4.67e-01 -0.01 0.99211
## BankStateNY -1.68e-01 4.49e-01 -0.37 0.70840
## BankStateOH -2.87e-01 4.48e-01 -0.64 0.52192
## BankStateOK 4.99e-01 4.64e-01 1.08 0.28151
## BankStateOR -1.11e+00 4.50e-01 -2.47 0.01359 *
## BankStatePA 7.91e-01 4.52e-01 1.75 0.08012 .
## BankStatePR -3.55e+00 1.42e+00 -2.49 0.01280 *
## BankStateRI -5.85e-01 4.48e-01 -1.31 0.19175
## BankStateSC -2.94e+00 4.68e-01 -6.27 3.6e-10 ***
## BankStateSD -9.31e-01 4.48e-01 -2.08 0.03762 *
## BankStateTN -2.89e-01 4.65e-01 -0.62 0.53376
## BankStateTX -4.71e-01 4.51e-01 -1.05 0.29545
## BankStateUT -4.20e-01 4.51e-01 -0.93 0.35188
## BankStateVA -1.51e+00 4.48e-01 -3.36 0.00077 ***
## BankStateVT 1.28e+00 4.83e-01 2.65 0.00815 **
## BankStateWA -5.98e-02 4.57e-01 -0.13 0.89584
## BankStateWI -1.24e-01 4.53e-01 -0.27 0.78472
## BankStateWV -5.89e-01 5.10e-01 -1.15 0.24819
## BankStateWY 9.50e-01 5.89e-01 1.61 0.10656
## Term 4.21e-02 2.20e-04 191.25 < 2e-16 ***
## FranchiseCode 8.51e-01 1.18e-02 72.44 < 2e-16 ***
## DisbursementGross 3.37e-07 2.95e-08 11.42 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 291335 on 251489 degrees of freedom
## Residual deviance: 212432 on 251385 degrees of freedom
## AIC: 212642
##
## Number of Fisher Scoring iterations: 6
best_score
## [1] 0.8321
best_table
##
## glm.pred CHGOFF P I F
## CHGOFF 14085 4615
## P I F 11597 66262
total_performance/10
## [1] 0.8313