Logistic regression is one of the most common tools for solving classification problems. Given a binary outcome, we want to predict whether an event will occur based on a set of quantitative and qualitative variables. In this blog post, we use a public dataset to classify loan approval status based on various socio-demographic features.
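For reference, logistic regression models the probability of the positive class through the logistic (sigmoid) function, and a probability threshold (commonly 0.5) turns that probability into a class label:

$$P(Y = 1 \mid X) = \frac{1}{1 + e^{-(\beta_0 + \beta_1 X_1 + \dots + \beta_k X_k)}}$$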
# load packages
if(!require(pacman)){install.packages("pacman"); require(pacman)}
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 3.6.2
packages <- c('tidyverse', 'caret', 'broom', 'Hmisc', 'kableExtra')
pacman::p_load(char = packages)
# read data
df <- read.csv(url('https://datahack-prod.s3.ap-south-1.amazonaws.com/train_file/train_u6lujuX_CVtuZ9i.csv'))
# check missing
colSums(is.na(df))
## Loan_ID Gender Married Dependents
## 0 0 0 0
## Education Self_Employed ApplicantIncome CoapplicantIncome
## 0 0 0 0
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## 22 14 50 0
## Loan_Status
## 0
# impute dataset
preProcValues <- preProcess(df, method = c("medianImpute", "center", "scale"))
df_processed <- predict(preProcValues, df)
# check again
colSums(is.na(df_processed))
## Loan_ID Gender Married Dependents
## 0 0 0 0
## Education Self_Employed ApplicantIncome CoapplicantIncome
## 0 0 0 0
## LoanAmount Loan_Amount_Term Credit_History Property_Area
## 0 0 0 0
## Loan_Status
## 0
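Note that `medianImpute` only fills numeric columns, and `center`/`scale` is also applied to the 0/1 `Credit_History` indicator (which is why it shows values like 0.43 below). The categorical columns contain blank strings (`""`) rather than `NA`, so `is.na()` never flagged them and they are left untouched. A minimal base-R sketch of how one might recode those blanks to the modal level, applied to a copy (`df_clean`, a hypothetical name) so the outputs below stay reproducible:
# hypothetical cleanup: replace blank factor levels with the most frequent level
df_clean <- df_processed
fct_cols <- c("Gender", "Married", "Dependents", "Self_Employed")
for (col in fct_cols) {
  x <- as.character(df_clean[[col]])
  x[x == ""] <- names(which.max(table(x[x != ""])))  # modal non-blank value
  df_clean[[col]] <- factor(x)
}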
# check dimensions, completeness and class distribution
broom::glance(df_processed)
## Warning: 'glance.data.frame' is deprecated.
## See help("Deprecated")
## # A tibble: 1 x 4
## nrow ncol complete.obs na.fraction
## <int> <int> <int> <dbl>
## 1 614 13 614 0
table(df_processed$Loan_Status)
##
## N Y
## 192 422
prop.table(ftable(df_processed$Loan_Status)) %>% round(., 1)
## N Y
##
## 0.3 0.7
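The classes are imbalanced: roughly 70% of applications were approved. A naive majority-class predictor therefore sets the accuracy bar that any model has to beat:
# majority-class baseline accuracy (always predict "Y")
max(prop.table(table(df_processed$Loan_Status)))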
# head df
head(df_processed) %>% kable(.)
| Loan_ID | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| LP001002 | Male | No | 0 | Graduate | No | 0.0729314 | -0.5540356 | -0.2151272 | 0.276411 | 0.4324768 | Urban | Y |
| LP001003 | Male | Yes | 1 | Graduate | No | -0.1343025 | -0.0387000 | -0.2151272 | 0.276411 | 0.4324768 | Rural | N |
| LP001005 | Male | Yes | 0 | Graduate | Yes | -0.3934266 | -0.5540356 | -0.9395335 | 0.276411 | 0.4324768 | Urban | Y |
| LP001006 | Male | Yes | 0 | Not Graduate | No | -0.4616860 | 0.2517743 | -0.3085990 | 0.276411 | 0.4324768 | Urban | Y |
| LP001008 | Male | No | 0 | Graduate | No | 0.0976488 | -0.5540356 | -0.0632356 | 0.276411 | 0.4324768 | Urban | Y |
| LP001011 | Male | Yes | 2 | Graduate | Yes | 0.0022165 | 0.8798823 | 1.4089450 | 0.276411 | 0.4324768 | Urban | Y |
# str(), dim()
str(df_processed)
## 'data.frame': 614 obs. of 13 variables:
## $ Loan_ID : Factor w/ 614 levels "LP001002","LP001003",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Gender : Factor w/ 3 levels "","Female","Male": 3 3 3 3 3 3 3 3 3 3 ...
## $ Married : Factor w/ 3 levels "","No","Yes": 2 3 3 3 2 3 3 3 3 3 ...
## $ Dependents : Factor w/ 5 levels "","0","1","2",..: 2 3 2 2 2 4 2 5 4 3 ...
## $ Education : Factor w/ 2 levels "Graduate","Not Graduate": 1 1 1 2 1 1 2 1 1 1 ...
## $ Self_Employed : Factor w/ 3 levels "","No","Yes": 2 2 3 2 2 3 2 2 2 2 ...
## $ ApplicantIncome : num 0.0729 -0.1343 -0.3934 -0.4617 0.0976 ...
## $ CoapplicantIncome: num -0.554 -0.0387 -0.554 0.2518 -0.554 ...
## $ LoanAmount : num -0.2151 -0.2151 -0.9395 -0.3086 -0.0632 ...
## $ Loan_Amount_Term : num 0.276 0.276 0.276 0.276 0.276 ...
## $ Credit_History : num 0.432 0.432 0.432 0.432 0.432 ...
## $ Property_Area : Factor w/ 3 levels "Rural","Semiurban",..: 3 1 3 3 3 3 3 2 3 2 ...
## $ Loan_Status : Factor w/ 2 levels "N","Y": 2 1 2 2 2 2 2 1 2 1 ...
dim(df_processed)
## [1] 614 13
# reorder and relabel the "Loan_Status" levels
levels(df_processed$Loan_Status)
## [1] "N" "Y"
df_processed <- df_processed %>% dplyr::mutate(Loan_Status = Loan_Status %>%
factor(., levels = c("Y", "N"), labels = c("Yes", "No")))
levels(df_processed$Loan_Status)
## [1] "Yes" "No"
# set seed
set.seed(1234)
# split data
index <- caret::createDataPartition(df_processed$Loan_Status, p = 0.7, list = FALSE)
trainSet <- df_processed[index, ]
testSet <- df_processed[-index, ]
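`createDataPartition()` samples within each level of the outcome, so the split is stratified; a quick check confirms that the train and test sets preserve the original Yes/No ratio:
# verify the stratified split kept the class proportions
prop.table(table(trainSet$Loan_Status))
prop.table(table(testSet$Loan_Status))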
# define the training control - let's do 10-fold cross-validation
train.control <- caret::trainControl(
method = "cv",
number = 10,
savePredictions = 'final',
classProbs = TRUE)
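Since the classes are imbalanced, accuracy alone can be misleading. Here is a sketch of an alternative control (not used below) that evaluates folds with ROC-based metrics instead, via caret's built-in `twoClassSummary()`:
# alternative control: evaluate folds with ROC, sensitivity and specificity
train.control.roc <- caret::trainControl(
method = "cv",
number = 10,
savePredictions = 'final',
classProbs = TRUE,
summaryFunction = caret::twoClassSummary)
# pass metric = "ROC" to caret::train() when using this control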
# define IVs, DV
IV <- names(df_processed)[names(df_processed) %nin% c("Loan_ID", "Loan_Status")]
DV <- "Loan_Status"
# build model
model <- caret::train(trainSet[, IV], trainSet[, DV], method = 'glm', trControl = train.control)
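Before reading individual coefficients, it is worth glancing at the cross-validated performance estimate that caret stored during training:
# resampled (10-fold CV) accuracy and kappa
model$results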
# summary model output
summary(model)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2889 -0.7438 -0.5647 0.4431 2.1745
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.162035 623.299739 -0.021 0.98315
## GenderFemale -0.210278 0.936934 -0.224 0.82242
## GenderMale -0.121769 0.884842 -0.138 0.89054
## MarriedNo 13.756845 623.299635 0.022 0.98239
## MarriedYes 13.250102 623.299664 0.021 0.98304
## Dependents0 -0.872462 1.038756 -0.840 0.40096
## Dependents1 -0.394333 1.061059 -0.372 0.71016
## Dependents2 -1.116722 1.088138 -1.026 0.30477
## Dependents3+ -1.114125 1.129840 -0.986 0.32409
## EducationNot Graduate 0.415874 0.302300 1.376 0.16891
## Self_EmployedNo 0.227842 0.593470 0.384 0.70104
## Self_EmployedYes 0.358209 0.662960 0.540 0.58898
## ApplicantIncome 0.009642 0.158041 0.061 0.95135
## CoapplicantIncome 0.200371 0.139653 1.435 0.15135
## LoanAmount 0.077609 0.146405 0.530 0.59605
## Loan_Amount_Term 0.031466 0.131057 0.240 0.81025
## Credit_History -1.227525 0.161786 -7.587 3.27e-14 ***
## Property_AreaSemiurban -0.836073 0.310128 -2.696 0.00702 **
## Property_AreaUrban -0.291588 0.306034 -0.953 0.34069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 535.87 on 430 degrees of freedom
## Residual deviance: 414.25 on 412 degrees of freedom
## AIC: 452.25
##
## Number of Fisher Scoring iterations: 13
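Note the huge standard errors (~623) on the intercept and the two `Married` dummies. This pattern typically signals (quasi-)complete separation; the likely culprit here is the handful of blank (`""`) `Married` values mentioned earlier, which serve as the reference level and may align almost perfectly with one outcome. A quick cross-tab makes the problem visible:
# inspect the blank reference level behind the unstable Married coefficients
table(trainSet$Married, trainSet$Loan_Status)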
# make predictions on the test set using the model above
testSet$prediction <- predict(object = model, testSet[, IV], type = "prob")$Yes
# convert predicted probabilities to class labels at a 0.5 cutoff
testSet <- testSet %>%
dplyr::mutate(Loan_Status_Predicted = ifelse(prediction > .5, "Y", "N") %>%
factor(., levels = c("Y", "N"), labels = c("Yes", "No")))
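A fixed 0.5 cutoff is only one choice; since we kept the predicted probabilities, a threshold-free view such as ROC/AUC is easy to add. A sketch, assuming the pROC package (not loaded above) is installed:
# ROC curve and AUC on the test set (requires the pROC package)
roc_obj <- pROC::roc(response = testSet$Loan_Status,
predictor = testSet$prediction,
levels = c("No", "Yes"))  # control, case
pROC::auc(roc_obj)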
# confusion matrix
caret::confusionMatrix(testSet$Loan_Status, testSet$Loan_Status_Predicted)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 126 0
## No 27 30
##
## Accuracy : 0.8525
## 95% CI : (0.7926, 0.9005)
## No Information Rate : 0.8361
## P-Value [Acc > NIR] : 0.3149
##
## Kappa : 0.6048
##
## Mcnemar's Test P-Value : 5.624e-07
##
## Sensitivity : 0.8235
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.5263
## Prevalence : 0.8361
## Detection Rate : 0.6885
## Detection Prevalence : 0.6885
## Balanced Accuracy : 0.9118
##
## 'Positive' Class : Yes
##
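One caveat: `caret::confusionMatrix()` expects the predictions as its first argument (`data`) and the ground truth as its second (`reference`). The call above passes them the other way around, which transposes the table; accuracy and kappa are unaffected, but sensitivity trades places with positive predictive value, and specificity with negative predictive value. The conventional call would be:
# conventional argument order: predictions first, truth second
caret::confusionMatrix(data = testSet$Loan_Status_Predicted,
reference = testSet$Loan_Status)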
# tidy result
result <- broom::tidy(caret::confusionMatrix(testSet$Loan_Status, testSet$Loan_Status_Predicted))
result %>% kable(.)
| term | class | estimate | conf.low | conf.high | p.value |
|---|---|---|---|---|---|
| accuracy | NA | 0.8524590 | 0.7926416 | 0.9004632 | 0.3149022 |
| kappa | NA | 0.6047516 | NA | NA | 0.0000006 |
| sensitivity | Yes | 0.8235294 | NA | NA | NA |
| specificity | Yes | 1.0000000 | NA | NA | NA |
| pos_pred_value | Yes | 1.0000000 | NA | NA | NA |
| neg_pred_value | Yes | 0.5263158 | NA | NA | NA |
| precision | Yes | 1.0000000 | NA | NA | NA |
| recall | Yes | 0.8235294 | NA | NA | NA |
| f1 | Yes | 0.9032258 | NA | NA | NA |
| prevalence | Yes | 0.8360656 | NA | NA | NA |
| detection_rate | Yes | 0.6885246 | NA | NA | NA |
| detection_prevalence | Yes | 0.6885246 | NA | NA | NA |
| balanced_accuracy | Yes | 0.9117647 | NA | NA | NA |
This simple example demonstrates how easy it is to build a logistic regression model in just a few lines of code. From the model summary, credit history is by far the most significant predictor of loan status (not surprising). Without tuning any parameters or doing any feature engineering, our baseline model achieves 85% accuracy, 82% sensitivity, 100% specificity, 100% precision, and a 0.90 F1 score as reported above (subject to the argument-order caveat noted earlier). There is of course plenty of room for improvement, but it is a very good start. Natural next steps are to drop uninformative features and to compare this model against other classification algorithms in search of a better solution.
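As a sketch of that comparison, any other caret learner can reuse the same resampling setup, after which `resamples()` collects the fold-level results side by side (assumes the randomForest package is available):
# compare logistic regression with a random forest under the same CV scheme
set.seed(1234)
model_rf <- caret::train(trainSet[, IV], trainSet[, DV],
method = 'rf', trControl = train.control)
summary(caret::resamples(list(glm = model, rf = model_rf)))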