Objective

In this project, I will explore a loan approval dataset, build several models to predict loan approval, and compare each model's performance based on prediction accuracy.

Data Exploration

Load the required libraries
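The functions used throughout this report come from the packages below. This is a sketch of the setup chunk, with the package-to-function mapping inferred from the calls that follow; in particular, I assume split_train_test comes from healthcareai and ctree from partykit (the older party package also provides ctree).

# Packages inferred from the functions called later in this report
library(dplyr)                     # %>%, mutate()
library(DataExplorer)              # plot_histogram(), plot_missing()
library(missForest)                # missForest() imputation
library(healthcareai)              # split_train_test()
library(MASS)                      # lda(), ldahist()
library(caret)                     # featurePlot(), preProcess(), train(), confusionMatrix(), varImp()
library(AppliedPredictiveModeling) # transparentTheme()
library(rpart)                     # rpart(), prune()
library(rpart.plot)                # rpart.plot()
library(partykit)                  # ctree()
library(randomForest)              # backend for caret's method = "rf"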

Load Data

# Load Data
Loan_approval =  read.csv("C:/Users/patel/Downloads/Loan_approval.csv", header=T, na.strings=c("","NA"))

The loan approval data dictionary is as follows:

VARIABLE            DESCRIPTION
Loan_ID             Unique loan ID
Gender              Male / Female
Married             Applicant married (Y/N)
Dependents          Number of dependents
Education           Applicant education (Graduate / Not Graduate)
Self_Employed       Self-employed (Y/N)
ApplicantIncome     Applicant income
CoapplicantIncome   Coapplicant income
LoanAmount          Loan amount in thousands
Loan_Amount_Term    Term of loan in months
Credit_History      Credit history meets guidelines (1/0)
Property_Area       Urban / Semiurban / Rural
Loan_Status         Loan approved (Y/N)

Data Summary

#dim
dim(Loan_approval)
## [1] 614  13

There are 614 observations of 13 variables.
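Before visualizing, a quick look at the column types shows which variables were read in as character versus numeric (a minimal sketch):

# Inspect column types and a few example values
str(Loan_approval)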

Frequency Distributions

The plots below compare the distribution of the target variable (Loan_Status) against each of the other variables. The variables can be categorical or continuous.

For categorical features

# To visualize distributions for all categorical features:
par(mfrow=c(3,3))

barplot(table(Loan_approval$Loan_Status, Loan_approval$Gender), main="Loan Status by Gender",
        xlab="Gender", legend = TRUE)

barplot(table(Loan_approval$Loan_Status, Loan_approval$Married), main="Loan Status by Married",
        xlab="Married", legend = TRUE)

barplot(table(Loan_approval$Loan_Status, Loan_approval$Dependents), main="Loan Status by Dependents",
        xlab="Dependents", legend = TRUE)

barplot(table(Loan_approval$Loan_Status, Loan_approval$Education), main="Loan Status by Education",
        xlab="Education", legend = TRUE)

barplot(table(Loan_approval$Loan_Status, Loan_approval$Credit_History), main="Loan Status by Credit_History",
        xlab="Credit_History", legend = TRUE)

barplot(table(Loan_approval$Loan_Status, Loan_approval$Self_Employed), main="Loan Status by Self Employed", 
        xlab="Self_Employed", legend = TRUE)

barplot(table(Loan_approval$Loan_Status, Loan_approval$Property_Area), main="Loan Status by Property_Area",
        xlab="Property_Area", legend = TRUE)

For continuous features

#To visualize distributions for all continuous features:
plot_histogram(Loan_approval)

Data Cleaning

# remove the Loan_ID identifier (no predictive value)
Loan_approval <- subset(Loan_approval, select = -Loan_ID )

# convert categorical variables to factors

Loan_approval <- Loan_approval %>%
           mutate(Gender = factor(Gender),
                  Married = factor(Married),
                  Dependents = factor(Dependents),
                  Education = factor(Education),
                  Self_Employed = factor(Self_Employed),
                  Property_Area = factor(Property_Area),
                  Loan_Status = factor(Loan_Status),
                  Credit_History= factor(Credit_History))

summary(Loan_approval)
##     Gender    Married    Dependents        Education   Self_Employed
##  Female:112   No  :213   0   :345   Graduate    :480   No  :500     
##  Male  :489   Yes :398   1   :102   Not Graduate:134   Yes : 82     
##  NA's  : 13   NA's:  3   2   :101                      NA's: 32     
##                          3+  : 51                                   
##                          NA's: 15                                   
##                                                                     
##                                                                     
##  ApplicantIncome CoapplicantIncome   LoanAmount    Loan_Amount_Term
##  Min.   :  150   Min.   :    0     Min.   :  9.0   Min.   : 12     
##  1st Qu.: 2878   1st Qu.:    0     1st Qu.:100.0   1st Qu.:360     
##  Median : 3812   Median : 1188     Median :128.0   Median :360     
##  Mean   : 5403   Mean   : 1621     Mean   :146.4   Mean   :342     
##  3rd Qu.: 5795   3rd Qu.: 2297     3rd Qu.:168.0   3rd Qu.:360     
##  Max.   :81000   Max.   :41667     Max.   :700.0   Max.   :480     
##                                    NA's   :22      NA's   :14      
##  Credit_History   Property_Area Loan_Status
##  0   : 89       Rural    :179   N:192      
##  1   :475       Semiurban:233   Y:422      
##  NA's: 50       Urban    :202              
##                                            
##                                            
##                                            
## 

I dropped Loan_ID from the dataset and converted the categorical variables to factors.
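As a side note, the same conversion can be written more compactly with dplyr's across(); this sketch is equivalent to the mutate() call above:

# Equivalent factor conversion using across()
categ_vars <- c("Gender", "Married", "Dependents", "Education",
                "Self_Employed", "Property_Area", "Loan_Status", "Credit_History")
Loan_approval <- Loan_approval %>% mutate(across(all_of(categ_vars), factor))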

Missing values table

#Checking the Missing data proportion
plot_missing(Loan_approval)

Handling Missing Values

From the missing value chart, I conclude that no variable is missing more than 10 percent of its values. The dataset is nearly complete, with just a few observations containing missing values that could be omitted or imputed. I will impute the missing values with the missForest library.

LA_df<- missForest(Loan_approval)
##   missForest iteration 1 in progress...done!
##   missForest iteration 2 in progress...done!
##   missForest iteration 3 in progress...done!
##   missForest iteration 4 in progress...done!
##   missForest iteration 5 in progress...done!
##   missForest iteration 6 in progress...done!
Loan_approval_clean <- LA_df$ximp

plot_missing(Loan_approval_clean)
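missForest also returns an out-of-bag estimate of the imputation error, which is worth inspecting before trusting the imputed values: NRMSE for the continuous variables and PFC (proportion of falsely classified) for the categorical ones.

# Out-of-bag imputation error estimate (NRMSE / PFC)
LA_df$OOBerror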

Splitting the data 70-30

set.seed(17)
# splitting the data into 70-30

df1_split=split_train_test(Loan_approval_clean,outcome=Loan_Status,0.7)

#display train
(head(df1_split$train,5))
##   Gender Married Dependents Education Self_Employed ApplicantIncome
## 1   Male      No          0  Graduate            No            5849
## 2   Male     Yes          1  Graduate            No            4583
## 3   Male     Yes          0  Graduate           Yes            3000
## 5   Male      No          0  Graduate            No            6000
## 6   Male     Yes          2  Graduate           Yes            5417
##   CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 1                 0   150.1979              360              1         Urban
## 2              1508   128.0000              360              1         Rural
## 3                 0    66.0000              360              1         Urban
## 5                 0   141.0000              360              1         Urban
## 6              4196   267.0000              360              1         Urban
##   Loan_Status
## 1           Y
## 2           N
## 3           Y
## 5           Y
## 6           Y
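A quick sanity check confirms the 70-30 split sizes and that the outcome distribution is similar in both partitions (a minimal sketch using the df1_split object from above):

# Partition sizes and outcome balance in each partition
nrow(df1_split$train); nrow(df1_split$test)
prop.table(table(df1_split$train$Loan_Status))
prop.table(table(df1_split$test$Loan_Status))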

Linear Discriminant Analysis

Selection of the variable

I drop the categorical variables (Gender, Married, Dependents, Education, Self_Employed, Credit_History, Property_Area), since Linear Discriminant Analysis (LDA) takes continuous variables as input.

# keep only the continuous variables for LDA
La_categ<- subset(df1_split$train, select = -c(Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area ))

Linearly separable or not?

I will use a feature plot to check whether the classes are linearly separable.

library(AppliedPredictiveModeling)
transparentTheme(trans = .4)

featurePlot(x = La_categ[,1:4], 
            y = La_categ$Loan_Status, 
            plot = "ellipse",
            ## Add a key at the top
            auto.key = list(columns = 3))
## Warning in draw.key(simpleKey(...), draw = FALSE): not enough rows for columns

The plot suggests that the classes are not linearly separable. The colored ellipses in the scatter plot represent the loan approval status, and their heavy overlap indicates that the classes are not linearly separable, so a Linear Discriminant Analysis model is unlikely to be ideal for this dataset. However, I will still build an LDA model to see how it performs against the other models.

Build LDA Model

# time the LDA fit
lda_rt_s<-Sys.time()
model_lda<- lda(Loan_Status ~. , data = La_categ)
lda_rt_e<-Sys.time()
lda_rt<- lda_rt_e-lda_rt_s

model_lda
## Call:
## lda(Loan_Status ~ ., data = La_categ)
## 
## Prior probabilities of groups:
##         N         Y 
## 0.3132251 0.6867749 
## 
## Group means:
##   ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
## N        5806.459          2079.622   156.1653         350.1582
## Y        5175.659          1467.662   142.7270         342.4789
## 
## Coefficients of linear discriminants:
##                             LD1
## ApplicantIncome   -5.565016e-05
## CoapplicantIncome -2.327229e-04
## LoanAmount        -2.736099e-03
## Loan_Amount_Term  -9.083394e-03

Prior probabilities of groups: the proportion of training observations in each group. For example, about 69% of the training observations are approved loans.

Group means: the center of gravity of each group, i.e., the mean of each variable within each group.

Coefficients of linear discriminants: the linear combination of the predictor variables used to form the LDA decision rule.
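To see how much the two classes overlap along the single discriminant axis, the LD1 scores on the training data can be plotted by group; this is a sketch using MASS::ldahist with the model_lda and La_categ objects from above:

# Histograms of LD1 scores by loan status; heavy overlap is consistent
# with the feature plot above
lda_scores <- predict(model_lda)$x
ldahist(lda_scores[, 1], g = La_categ$Loan_Status)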

La_categ_test<- subset(df1_split$test, select = -c(Gender,Married,Dependents,Education,Self_Employed,Credit_History,Property_Area ))


predict_lda_test <- predict(model_lda, La_categ_test)

cm_lda <- confusionMatrix( predict_lda_test$class, La_categ_test$Loan_Status)


#confusionMatrix
fourfoldplot(cm_lda$table, color = c("#CC6666", "#99CC99"),
             conf.level = 0, margin = 1, main = "Confusion Matrix")
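For consistency with the other models, the LDA test-set accuracy can also be computed directly (its value appears again in the final comparison table):

# LDA test-set accuracy
mean(predict_lda_test$class == La_categ_test$Loan_Status)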

K-nearest neighbor (KNN) algorithm

Preparation

Preprocessing corrects problems in the data before a machine learning model is built on it. Problems can be of many types, such as missing values or attributes with very different ranges.
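As a worked illustration, centering and scaling one variable by hand shows exactly what the "center" and "scale" methods compute: subtract the training-set mean, then divide by the training-set standard deviation.

# Standardizing one variable by hand; preProcess() does this for every
# numeric column using statistics estimated from the training set
x <- df1_split$train$ApplicantIncome
summary((x - mean(x)) / sd(x))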

# inspect the centering/scaling transformation; train() below applies the
# same preprocessing via its preProcess argument
prepro <- preProcess(x = df1_split$train, method = c("center", "scale"))
prepro
## Created from 431 samples and 12 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (8)
##   - scaled (4)

The trainControl() function controls the computational nuances of the train() method. I will use method "repeatedcv" for repeated cross-validation.

trControl <- trainControl(method="repeatedcv",number = 10, repeats = 5) 
start_time<-Sys.time()
model_knn <- train(Loan_Status ~ ., data = df1_split$train, 
                method = "knn", 
                trControl = trControl, 
                preProcess = c("center","scale"), 
                tuneLength = 20)
model_knn 
## k-Nearest Neighbors 
## 
## 431 samples
##  11 predictor
##   2 classes: 'N', 'Y' 
## 
## Pre-processing: centered (14), scaled (14) 
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 388, 387, 388, 388, 388, 388, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.7943094  0.4453921
##    7  0.7892344  0.4215155
##    9  0.7951102  0.4323313
##   11  0.7904782  0.4151617
##   13  0.7891704  0.4093342
##   15  0.7854379  0.3941326
##   17  0.7817794  0.3816730
##   19  0.7841382  0.3863664
##   21  0.7855774  0.3924168
##   23  0.7888115  0.4029335
##   25  0.7850896  0.3907702
##   27  0.7827534  0.3824687
##   29  0.7798656  0.3732505
##   31  0.7756785  0.3591435
##   33  0.7728536  0.3485634
##   35  0.7695968  0.3366386
##   37  0.7658643  0.3230682
##   39  0.7612559  0.3068663
##   41  0.7566365  0.2878747
##   43  0.7533580  0.2750324
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
end_time<-Sys.time()
knn_rt<- end_time-start_time

Accuracy was used to select the optimal model using the largest value; the final value used for the model was k = 9.

plot(model_knn)

Predict from the KNN model

predict_knn_test <- predict(model_knn,newdata = df1_split$test)
mean(predict_knn_test == df1_split$test$Loan_Status) # accuracy
## [1] 0.7814208
cm_knn <- confusionMatrix(predict_knn_test, df1_split$test$Loan_Status)
cm_knn
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   N   Y
##          N  22   5
##          Y  35 121
##                                          
##                Accuracy : 0.7814         
##                  95% CI : (0.7145, 0.839)
##     No Information Rate : 0.6885         
##     P-Value [Acc > NIR] : 0.003396       
##                                          
##                   Kappa : 0.4046         
##                                          
##  Mcnemar's Test P-Value : 4.533e-06      
##                                          
##             Sensitivity : 0.3860         
##             Specificity : 0.9603         
##          Pos Pred Value : 0.8148         
##          Neg Pred Value : 0.7756         
##              Prevalence : 0.3115         
##          Detection Rate : 0.1202         
##    Detection Prevalence : 0.1475         
##       Balanced Accuracy : 0.6731         
##                                          
##        'Positive' Class : N              
## 
fourfoldplot(cm_knn$table, color = c("#CC6666", "#99CC99"),
             conf.level = 0, margin = 1, main = "knn Confusion Matrix")

Decision Tree model

start_time<-Sys.time()
model_dt <- rpart(Loan_Status~ ., data=df1_split$train)

end_time<-Sys.time()
dt_rt<- end_time-start_time
rpart.plot(model_dt, nn=TRUE)

# conditional inference tree, for comparison with the rpart tree above
ctree_ <- ctree(Loan_Status~ ., data=df1_split$train)
plot(ctree_)

summary(model_dt)
## Call:
## rpart(formula = Loan_Status ~ ., data = df1_split$train)
##   n= 431 
## 
##          CP nsplit rel error    xerror       xstd
## 1 0.4296296      0 1.0000000 1.0000000 0.07132476
## 2 0.0100000      1 0.5703704 0.5703704 0.05890804
## 
## Variable importance
## Credit_History 
##            100 
## 
## Node number 1: 431 observations,    complexity param=0.4296296
##   predicted class=Y  expected loss=0.3132251  P(node) =1
##     class counts:   135   296
##    probabilities: 0.313 0.687 
##   left son=2 (68 obs) right son=3 (363 obs)
##   Primary splits:
##       Credit_History    splits as  LR,           improve=60.726510, (0 missing)
##       Property_Area     splits as  LRL,          improve= 6.303598, (0 missing)
##       Loan_Amount_Term  < 361.4718 to the right, improve= 4.654997, (0 missing)
##       LoanAmount        < 163      to the right, improve= 2.876391, (0 missing)
##       CoapplicantIncome < 8219.5   to the right, improve= 2.289073, (0 missing)
## 
## Node number 2: 68 observations
##   predicted class=N  expected loss=0.07352941  P(node) =0.1577726
##     class counts:    63     5
##    probabilities: 0.926 0.074 
## 
## Node number 3: 363 observations
##   predicted class=Y  expected loss=0.1983471  P(node) =0.8422274
##     class counts:    72   291
##    probabilities: 0.198 0.802
# rpart.control() settings apply when a tree is fitted, not at prediction
# time, so no control argument is passed to predict()
predict_dt_test <- predict(model_dt, df1_split$test, type = "class")

cm_dt<- confusionMatrix(predict_dt_test, df1_split$test$Loan_Status)

fourfoldplot(cm_dt$table, color = c("#CC6666", "#99CC99"),
             conf.level = 0, margin = 1, main = "Decision Tree Confusion Matrix")

plotcp(model_dt)
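The cp table printed by summary() (and visualized by plotcp) can also drive pruning. A common sketch, assuming the model_dt object above, selects the cp value with the smallest cross-validated error and prunes the tree to it:

# Prune at the cp with the smallest cross-validated error (xerror)
best_cp <- model_dt$cptable[which.min(model_dt$cptable[, "xerror"]), "CP"]
model_dt_pruned <- prune(model_dt, cp = best_cp)
rpart.plot(model_dt_pruned, nn = TRUE)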

Random Forest Model

Build Model

# train() is called without a trControl argument here, so caret falls back
# to its default bootstrap resampling (25 reps), as the output below shows
start_time<-Sys.time()
model_rf <- train(Loan_Status~., data = df1_split$train, method="rf")
end_time<-Sys.time()

rf_rt<- end_time-start_time

print(model_rf)
## Random Forest 
## 
## 431 samples
##  11 predictor
##   2 classes: 'N', 'Y' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 431, 431, 431, 431, 431, 431, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8099329  0.4996482
##    8    0.7822541  0.4530865
##   14    0.7721675  0.4368855
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
plot(model_rf)
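If a repeated-cross-validation grid search over mtry is wanted instead, it must be passed explicitly. The following is a hypothetical sketch (not what produced the output above, which used caret's default bootstrap):

# Hypothetical alternative: explicit repeated-CV grid search over mtry
rf_control <- trainControl(method="repeatedcv", number=10, repeats=3, search="grid")
rf_grid <- expand.grid(mtry = c(2, 8, 14))
model_rf_cv <- train(Loan_Status~., data = df1_split$train, method="rf",
                     trControl = rf_control, tuneGrid = rf_grid)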

Variable Importance

rfImp <- varImp(model_rf, scale = FALSE)
plot(rfImp)

The top five variables by importance are Credit_History1, ApplicantIncome, LoanAmount, CoapplicantIncome, and Loan_Amount_Term.

# prediction from random forest model
predict_rf_test <- predict(model_rf, df1_split$test,type='raw')
mean(predict_rf_test == df1_split$test$Loan_Status) # accuracy
## [1] 0.8196721
cm_rf <- confusionMatrix(predict_rf_test, df1_split$test$Loan_Status)
cm_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   N   Y
##          N  27   3
##          Y  30 123
##                                           
##                Accuracy : 0.8197          
##                  95% CI : (0.7562, 0.8725)
##     No Information Rate : 0.6885          
##     P-Value [Acc > NIR] : 4.295e-05       
##                                           
##                   Kappa : 0.5169          
##                                           
##  Mcnemar's Test P-Value : 6.011e-06       
##                                           
##             Sensitivity : 0.4737          
##             Specificity : 0.9762          
##          Pos Pred Value : 0.9000          
##          Neg Pred Value : 0.8039          
##              Prevalence : 0.3115          
##          Detection Rate : 0.1475          
##    Detection Prevalence : 0.1639          
##       Balanced Accuracy : 0.7249          
##                                           
##        'Positive' Class : N               
## 
fourfoldplot(cm_rf$table, color = c("#CC6666", "#99CC99"),
             conf.level = 0, margin = 1, main = "Random Forest Confusion Matrix")

Model Performance

results<-as.data.frame(round(cm_lda$overall,4))
names(results)[1] <-"lda"
results$knn <- round(cm_knn$overall, 4)
results$decisiontree <- round(cm_dt$overall, 4)
results$randomforest <- round(cm_rf$overall, 4)


# convert every runtime to seconds so the units are comparable
runtime<-rbind(c(as.numeric(lda_rt, units="secs"), as.numeric(knn_rt, units="secs"),
                 as.numeric(dt_rt, units="secs"), as.numeric(rf_rt, units="secs")))
results<-data.frame(rbind(as.matrix(results), as.matrix(runtime)))
row.names(results)[8] <- "Runtime"

(results)
##                       lda      knn decisiontree randomforest
## Accuracy       0.68310000  0.78140   0.82510000      0.81970
## Kappa          0.00230000  0.40460   0.52900000      0.51690
## AccuracyLower  0.61030000  0.71450   0.76220000      0.75620
## AccuracyUpper  0.74970000  0.83900   0.87720000      0.87250
## AccuracyNull   0.68850000  0.68850   0.68850000      0.68850
## AccuracyPValue 0.59820000  0.00340   0.00000000      0.00000
## McnemarPValue  0.00000000  0.00000   0.00000000      0.00000
## Runtime        0.02108288 20.24391   0.04120398     56.01115

The results suggest that the decision tree and random forest perform better than LDA and KNN, with accuracies of 0.8251 and 0.8197 respectively. The model I pick is the decision tree: not only does it have the best accuracy, it is also significantly faster than the random forest. The decision tree's runtime is about 0.04 seconds, versus about 56 seconds for the random forest.
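A small sketch to visualize the accuracy comparison, assuming the results table built above:

# Bar chart of test accuracy per model
acc <- unlist(results["Accuracy", ])
barplot(acc, names.arg = colnames(results), ylim = c(0, 1),
        main = "Test Accuracy by Model", ylab = "Accuracy")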