loan_Df <- read.csv("C:/Users/PC/Documents/R_4DS/bank_loan.csv", na.strings = c("", " ", NA))
glimpse(loan_Df)
Rows: 614
Columns: 13
$ Loan_ID [3m[38;5;246m<fct>[39m[23m LP001002, LP001003, LP001005, LP001006, LP001008, LP001011, LP001013, LP001014, LP001018, LP001020, LP0010~
$ Gender [3m[38;5;246m<fct>[39m[23m Male, Male, Male, Male, Male, Male, Male, Male, Male, Male, Male, Male, Male, Male, Male, Male, Male, Fema~
$ Married [3m[38;5;246m<fct>[39m[23m No, Yes, Yes, Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, No, Yes, No, No, No, Yes, Yes, Yes, Yes, Ye~
$ Dependents [3m[38;5;246m<int>[39m[23m 0, 1, 0, 0, 0, 2, 0, 3, 2, 1, 2, 2, 2, 0, 2, 0, 1, 0, 0, 0, 0, 1, 0, 2, 1, 0, 0, 2, 0, 2, 1, 0, 1, 0, 3, 0~
$ Education [3m[38;5;246m<fct>[39m[23m Graduate, Graduate, Graduate, Not Graduate, Graduate, Graduate, Not Graduate, Graduate, Graduate, Graduate~
$ Self_Employed [3m[38;5;246m<fct>[39m[23m No, No, Yes, No, No, Yes, No, No, No, No, No, NA, No, No, No, No, No, No, No, NA, No, No, No, No, NA, Yes,~
$ ApplicantIncome [3m[38;5;246m<int>[39m[23m 5849, 4583, 3000, 2583, 6000, 5417, 2333, 3036, 4006, 12841, 3200, 2500, 3073, 1853, 1299, 4950, 3596, 351~
$ CoapplicantIncome [3m[38;5;246m<dbl>[39m[23m 0, 1508, 0, 2358, 0, 4196, 1516, 2504, 1526, 10968, 700, 1840, 8106, 2840, 1086, 0, 0, 0, 0, 3500, 0, 5625~
$ LoanAmount [3m[38;5;246m<int>[39m[23m NA, 128, 66, 120, 141, 267, 95, 158, 168, 349, 70, 109, 200, 114, 17, 125, 100, 76, 133, 115, 104, 315, 11~
$ Loan_Amount_Term [3m[38;5;246m<int>[39m[23m 360, 360, 360, 360, 360, 360, 360, 360, 360, 360, 360, 360, 360, 360, 120, 360, 240, 360, 360, NA, 360, 36~
$ Credit_History [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, NA, 0, 1, 1, 0, 1, 0, 0, NA, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1~
$ Property_Area [3m[38;5;246m<fct>[39m[23m Urban, Rural, Urban, Urban, Urban, Urban, Urban, Semiurban, Urban, Semiurban, Urban, Urban, Urban, Rural, ~
$ Loan_Status [3m[38;5;246m<fct>[39m[23m Y, N, Y, Y, Y, Y, Y, N, Y, N, Y, Y, Y, N, Y, Y, Y, N, N, Y, N, Y, N, N, N, Y, Y, Y, N, Y, N, N, N, Y, N, Y~
Glimpse of dataset indicates an approach to Supervised Learning, a couple of explanatory variablesinfluencing whether or not a Loan is availed, a binarized outcome implying a Logistic Regression approach to understanding these underlying/obvious variable relationship.
## Check Missing Values
null_vars <- (sapply(loan_Df, function(x) sum(is.na(x))))
t(data.frame(null_vars))
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
null_vars 0 13 3 15 0 32 0 0 22 14
Credit_History Property_Area Loan_Status
null_vars 50 0 0
## Blank Rows
blank_rows <- (sapply(loan_Df, function(x) sum(x == "")))
t(data.frame(blank_rows))
Loan_ID Gender Married Dependents Education Self_Employed ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term
blank_rows 0 NA NA NA 0 NA 0 0 NA NA
Credit_History Property_Area Loan_Status
blank_rows NA 0 0
loan_Df <- loan_Df %>%
mutate(Credit_History = replace_na(Credit_History, mean(Credit_History, na.rm = T))) %>%
mutate(LoanAmount = replace_na(LoanAmount, mean(LoanAmount, na.rm = T))) %>%
mutate(Dependents = replace_na(Dependents, mean(Dependents, na.rm = T))) %>%
mutate(Loan_Amount_Term = replace_na(Loan_Amount_Term, min(Dependents, na.rm = T))) %>%
fill(Gender, .direction = 'downup') %>%
fill(Self_Employed, .direction = 'updown') %>%
filter(!Married == "") %>%
mutate(Credit_History = ifelse(Credit_History < 1, 0, Credit_History)) %>%
mutate(Loan_Amount_Term = case_when(Loan_Amount_Term <= 120 ~ "Short_Termed",
Loan_Amount_Term <= 300 ~ "Mid_Termed",
Loan_Amount_Term > 300 ~ "Long_Termed")) %>%
mutate(Loan_Amount_Term = factor(Loan_Amount_Term, levels = c("Short_Termed", "Mid_Termed", "Long_Termed"))) %>%
select(-c(Loan_ID))
## Distrubtution of the Response Variable
loan_Df %>%
ggplot(aes(x = Loan_Status))+
geom_bar()
Over response data is imbalanced.
library(mltools)
ml_df <- data.table(loan_Df[-12])
ml_df_encoded <- one_hot(ml_df)
ml_df_encoded <- cbind(ml_df_encoded, loan_Df[12])
quick_sample(ml_df_encoded)
### Train-Test
n_train <- round(0.8*nrow(ml_df_encoded))
train_indices <- sample(1:nrow(ml_df_encoded), n_train)
df_train <- ml_df_encoded[train_indices,]
df_test <- ml_df_encoded[-train_indices,]
### Logistic
logi_model <- glm(formula = Loan_Status ~ Gender_Female + Gender_Male +
Married_Yes + Dependents + Education_Graduate +
Self_Employed_No + ApplicantIncome + CoapplicantIncome +
LoanAmount + Loan_Amount_Term_Short_Termed +
Loan_Amount_Term_Mid_Termed + Loan_Amount_Term_Long_Termed +
Credit_History + Property_Area_Rural +
Property_Area_Semiurban + Property_Area_Urban, family = "binomial", data = df_train)
#disable scientific notation for model summary
# options(scipen=999)
#view model summary
summary(logi_model)
Call:
glm(formula = Loan_Status ~ Gender_Female + Gender_Male + Married_Yes +
Dependents + Education_Graduate + Self_Employed_No + ApplicantIncome +
CoapplicantIncome + LoanAmount + Loan_Amount_Term_Short_Termed +
Loan_Amount_Term_Mid_Termed + Loan_Amount_Term_Long_Termed +
Credit_History + Property_Area_Rural + Property_Area_Semiurban +
Property_Area_Urban, family = "binomial", data = df_train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.2188 -0.7644 0.5659 0.7215 1.8959
Coefficients: (3 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.29129852 0.54591519 -2.365 0.0180 *
Gender_Female 0.02669178 0.30437075 0.088 0.9301
Gender_Male NA NA NA NA
Married_Yes 0.54278238 0.26642078 2.037 0.0416 *
Dependents -0.07398764 0.12140384 -0.609 0.5422
Education_Graduate 0.32381124 0.27754538 1.167 0.2433
Self_Employed_No -0.08376748 0.33032673 -0.254 0.7998
ApplicantIncome 0.00001498 0.00003862 0.388 0.6980
CoapplicantIncome 0.00002115 0.00004935 0.429 0.6683
LoanAmount -0.00146255 0.00194877 -0.750 0.4530
Loan_Amount_Term_Short_Termed 0.25294458 0.60401773 0.419 0.6754
Loan_Amount_Term_Mid_Termed -0.47711121 0.38851245 -1.228 0.2194
Loan_Amount_Term_Long_Termed NA NA NA NA
Credit_History 2.26711080 0.25700169 8.821 <0.0000000000000002 ***
Property_Area_Rural -0.36904831 0.27358706 -1.349 0.1774
Property_Area_Semiurban 0.56077705 0.28365155 1.977 0.0480 *
Property_Area_Urban NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 609.28 on 488 degrees of freedom
Residual deviance: 497.25 on 475 degrees of freedom
AIC: 525.25
Number of Fisher Scoring iterations: 4
library(pscl)
Classes and Methods for R developed in the
Political Science Computational Laboratory
Department of Political Science
Stanford University
Simon Jackman
hurdle and zeroinfl functions by Achim Zeileis
pR2(logi_model)["McFadden"]
fitting null model for pseudo-r2
McFadden
0.1838804
A value of 0.1838804 is low for McFadden’s R2, which indicates that our model does not fit the data very well and has low predictive power.
## VIP Variables
library(caret)
vip_vars <- data.frame(varImp(logi_model))
vip_vars %>%
arrange(-Overall)
library(InformationValue)
Attaching package: 㤼㸱InformationValue㤼㸲
The following objects are masked from 㤼㸱package:caret㤼㸲:
confusionMatrix, precision, sensitivity, specificity
## Make Predictions
predicted <- predict(logi_model, df_test, type="response")
prediction from a rank-deficient fit may be misleading
optimal <- optimalCutoff(df_test$Loan_Status, predicted)[1]
optimal
[1] 0.1185697
#+> prediction from a rank-deficient fit may be misleading This tells us that the optimal probability cutoff to use is 0.1185697. Thus, any individual with a probability of defaulting of 0.1185697 or higher will be predicted to get the Loan, while any individual with a probability less than this number will be predicted to not be availed a Loan.
## Checking
confusionMatrix(df_test$Loan_Status, predicted)
#calculate sensitivity
sensitivity(df_test$Loan_Status, predicted)
[1] NaN
#calculate specificity
specificity(df_test$Loan_Status, predicted)
[1] 0.2540984
#calculate total misclassification error rate
misClassError(df_test$Loan_Status, predicted, threshold=optimal)
[1] 1