Logistic and Its Versions

#=================================
#       Data Pre-processing
#=================================

# Load some packages for data manipulation: 
library(tidyverse)
library(magrittr)

# Clear workspace: 
rm(list = ls())

# Use GermanCredit: 

library(caret)
data("GermanCredit")
set.seed(1)
id <- createDataPartition(y = GermanCredit$Class, p = 0.7, list = FALSE)
df_train <- GermanCredit[id, ]
df_test <- GermanCredit[-id, ]


# Activate h2o package for using: 
library(h2o)
h2o.init(nthreads = 16, max_mem_size = "16g")
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /tmp/RtmpovArcZ/h2o_chidung_started_from_r.out
##     /tmp/RtmpovArcZ/h2o_chidung_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: .. Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 seconds 990 milliseconds 
##     H2O cluster timezone:       Asia/Ho_Chi_Minh 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.20.0.8 
##     H2O cluster version age:    2 months and 24 days  
##     H2O cluster name:           H2O_started_from_R_chidung_yxt660 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   16.00 GB 
##     H2O cluster total cores:    32 
##     H2O cluster allowed cores:  16 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)
h2o.no_progress()

# Convert to h2o Frame and identify inputs and output: 
test <- as.h2o(df_test)
train <- as.h2o(df_train)

response <- "Class"
predictors <- setdiff(names(train), response)


#===================================
#   Pure, Lasso and Ridge Logistic
#===================================

# Train Logistic Model: 
pure_logistic <- h2o.glm(family= "binomial", 
                         x = predictors, 
                         y = response, 
                         lambda = 0, 
                         training_frame = train)
                         
                      
# Function Shows the coefficients table: 

show_coeffs <- function(model_selected) {
  model_selected@model$coefficients_table %>% 
    as.data.frame() %>% 
    mutate_if(is.numeric, function(x) {round(x, 3)}) %>% 
    filter(coefficients != 0) %>% 
    knitr::kable()
}


# Use this function: 
show_coeffs(pure_logistic)
names coefficients standardized_coefficients
Intercept 5.549 1.435
Duration -0.035 -0.411
InstallmentRatePercentage -0.392 -0.440
ResidenceDuration 0.031 0.034
Age 0.016 0.186
NumberExistingCredits -0.135 -0.080
NumberPeopleMaintenance -0.131 -0.048
Telephone -0.197 -0.097
ForeignWorker -2.013 -0.414
CheckingAccountStatus.lt.0 -0.764 -0.342
CheckingAccountStatus.0.to.200 -0.474 -0.208
CheckingAccountStatus.gt.200 0.369 0.090
CheckingAccountStatus.none 0.924 0.453
CreditHistory.NoCredit.AllPaid -1.701 -0.322
CreditHistory.ThisBank.AllPaid -0.885 -0.201
CreditHistory.PaidDuly -0.117 -0.058
CreditHistory.Delay -0.325 -0.090
CreditHistory.Critical 0.762 0.350
Purpose.NewCar -0.884 -0.382
Purpose.UsedCar 1.022 0.309
Purpose.Furniture.Equipment 0.137 0.052
Purpose.Radio.Television 0.180 0.081
Purpose.DomesticAppliance 0.036 0.004
Purpose.Repairs -0.415 -0.062
Purpose.Education -0.629 -0.137
Purpose.Retraining 9.294 0.857
Purpose.Business -0.218 -0.066
Purpose.Other -0.835 -0.063
SavingsAccountBonds.lt.100 -0.669 -0.327
SavingsAccountBonds.100.to.500 0.326 0.104
SavingsAccountBonds.500.to.1000 -0.066 -0.017
SavingsAccountBonds.gt.1000 0.620 0.133
SavingsAccountBonds.Unknown 0.751 0.278
EmploymentDuration.lt.1 -0.258 -0.099
EmploymentDuration.1.to.4 0.037 0.017
EmploymentDuration.4.to.7 0.538 0.209
EmploymentDuration.gt.7 -0.157 -0.069
EmploymentDuration.Unemployed -0.385 -0.089
Personal.Male.Divorced.Seperated -0.183 -0.038
Personal.Female.NotSingle -0.216 -0.100
Personal.Male.Single 0.271 0.135
Personal.Male.Married.Widowed -0.146 -0.044
OtherDebtorsGuarantors.None -0.184 -0.054
OtherDebtorsGuarantors.CoApplicant -0.955 -0.181
OtherDebtorsGuarantors.Guarantor 0.909 0.214
Property.RealEstate 0.279 0.125
Property.Insurance -0.268 -0.115
Property.CarOther 0.138 0.065
Property.Unknown -0.308 -0.108
OtherInstallmentPlans.Bank -0.534 -0.182
OtherInstallmentPlans.Stores -0.284 -0.062
OtherInstallmentPlans.None 0.503 0.195
Housing.Rent -0.337 -0.132
Housing.Own -0.039 -0.017
Housing.ForFree 0.708 0.206
Job.UnemployedUnskilled 0.458 0.070
Job.UnskilledResident -0.072 -0.030
Job.SkilledEmployee -0.034 -0.017
Job.Management.SelfEmp.HighlyQualified 0.075 0.027
# Lasso Logistic Model: 

lasso_logistic <- h2o.glm(family = "binomial", 
                          alpha = 1,
                          seed = 1988, 
                          x = predictors, 
                          y = response, 
                          training_frame = train)

show_coeffs(lasso_logistic)
names coefficients standardized_coefficients
Intercept 2.160 1.098
Duration -0.029 -0.338
InstallmentRatePercentage -0.167 -0.187
Age 0.005 0.053
ForeignWorker -0.734 -0.151
CheckingAccountStatus.lt.0 -0.175 -0.078
CheckingAccountStatus.gt.200 0.510 0.125
CheckingAccountStatus.none 1.157 0.567
CreditHistory.NoCredit.AllPaid -1.042 -0.197
CreditHistory.ThisBank.AllPaid -0.300 -0.068
CreditHistory.Critical 0.581 0.267
Purpose.NewCar -0.459 -0.198
Purpose.UsedCar 0.632 0.191
Purpose.Radio.Television 0.175 0.079
Purpose.Education -0.124 -0.027
Purpose.Retraining 0.824 0.076
SavingsAccountBonds.lt.100 -0.637 -0.312
SavingsAccountBonds.Unknown 0.095 0.035
EmploymentDuration.lt.1 -0.219 -0.084
EmploymentDuration.4.to.7 0.190 0.074
Personal.Female.NotSingle -0.013 -0.006
Personal.Male.Single 0.069 0.034
OtherDebtorsGuarantors.CoApplicant -0.280 -0.053
OtherDebtorsGuarantors.Guarantor 0.534 0.125
Property.RealEstate 0.143 0.064
Property.Insurance -0.020 -0.008
OtherInstallmentPlans.None 0.655 0.254
Housing.Rent -0.168 -0.066
# Ridge Logistic Model: 
ridge_logistic <- h2o.glm(family = "binomial", 
                          alpha = 0,
                          seed = 1988, 
                          x = predictors, 
                          y = response, 
                          training_frame = train)

show_coeffs(ridge_logistic)
names coefficients standardized_coefficients
Intercept 4.364 1.278
Duration -0.032 -0.374
InstallmentRatePercentage -0.305 -0.342
ResidenceDuration 0.028 0.031
Age 0.013 0.156
NumberExistingCredits -0.085 -0.050
NumberPeopleMaintenance -0.091 -0.033
Telephone -0.150 -0.074
ForeignWorker -1.510 -0.311
CheckingAccountStatus.lt.0 -0.697 -0.312
CheckingAccountStatus.0.to.200 -0.432 -0.189
CheckingAccountStatus.gt.200 0.372 0.091
CheckingAccountStatus.none 0.833 0.408
CreditHistory.NoCredit.AllPaid -1.489 -0.282
CreditHistory.ThisBank.AllPaid -0.732 -0.166
CreditHistory.PaidDuly -0.093 -0.046
CreditHistory.Delay -0.303 -0.084
CreditHistory.Critical 0.653 0.299
Purpose.NewCar -0.695 -0.300
Purpose.UsedCar 0.898 0.271
Purpose.Furniture.Equipment 0.167 0.063
Purpose.Radio.Television 0.239 0.107
Purpose.DomesticAppliance 0.089 0.009
Purpose.Repairs -0.335 -0.050
Purpose.Education -0.491 -0.107
Purpose.Retraining 2.821 0.260
Purpose.Business -0.160 -0.048
Purpose.Other -0.705 -0.053
SavingsAccountBonds.lt.100 -0.580 -0.284
SavingsAccountBonds.100.to.500 0.265 0.084
SavingsAccountBonds.500.to.1000 0.067 0.017
SavingsAccountBonds.gt.1000 0.505 0.109
SavingsAccountBonds.Unknown 0.616 0.228
EmploymentDuration.lt.1 -0.265 -0.102
EmploymentDuration.1.to.4 0.037 0.017
EmploymentDuration.4.to.7 0.445 0.173
EmploymentDuration.gt.7 -0.106 -0.046
EmploymentDuration.Unemployed -0.287 -0.067
Personal.Male.Divorced.Seperated -0.159 -0.033
Personal.Female.NotSingle -0.178 -0.082
Personal.Male.Single 0.210 0.105
Personal.Male.Married.Widowed -0.080 -0.024
OtherDebtorsGuarantors.None -0.152 -0.045
OtherDebtorsGuarantors.CoApplicant -0.828 -0.157
OtherDebtorsGuarantors.Guarantor 0.775 0.182
Property.RealEstate 0.245 0.110
Property.Insurance -0.230 -0.099
Property.CarOther 0.082 0.039
Property.Unknown -0.206 -0.072
OtherInstallmentPlans.Bank -0.466 -0.159
OtherInstallmentPlans.Stores -0.298 -0.065
OtherInstallmentPlans.None 0.454 0.176
Housing.Rent -0.273 -0.107
Housing.Own 0.007 0.003
Housing.ForFree 0.482 0.140
Job.UnemployedUnskilled 0.275 0.042
Job.UnskilledResident -0.031 -0.013
Job.SkilledEmployee -0.026 -0.013
Job.Management.SelfEmp.HighlyQualified 0.038 0.014
# Function shows model performance on test data: 

my_cm <- function(model_selected) {
  pred <- h2o.predict(model_selected, test) %>% 
    as.data.frame() %>% 
    pull(1)
  confusionMatrix(pred, df_test$Class, positive = "Bad") %>% 
    return()
}

lapply(list(pure_logistic, lasso_logistic, ridge_logistic), my_cm)
## [[1]]
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Bad Good
##       Bad   26   17
##       Good  64  193
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.676, 0.7794)
##     No Information Rate : 0.7            
##     P-Value [Acc > NIR] : 0.1418         
##                                          
##                   Kappa : 0.2444         
##  Mcnemar's Test P-Value : 3.203e-07      
##                                          
##             Sensitivity : 0.28889        
##             Specificity : 0.91905        
##          Pos Pred Value : 0.60465        
##          Neg Pred Value : 0.75097        
##              Prevalence : 0.30000        
##          Detection Rate : 0.08667        
##    Detection Prevalence : 0.14333        
##       Balanced Accuracy : 0.60397        
##                                          
##        'Positive' Class : Bad            
##                                          
## 
## [[2]]
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Bad Good
##       Bad   32   31
##       Good  58  179
##                                           
##                Accuracy : 0.7033          
##                  95% CI : (0.6481, 0.7545)
##     No Information Rate : 0.7             
##     P-Value [Acc > NIR] : 0.478232        
##                                           
##                   Kappa : 0.2274          
##  Mcnemar's Test P-Value : 0.005851        
##                                           
##             Sensitivity : 0.3556          
##             Specificity : 0.8524          
##          Pos Pred Value : 0.5079          
##          Neg Pred Value : 0.7553          
##              Prevalence : 0.3000          
##          Detection Rate : 0.1067          
##    Detection Prevalence : 0.2100          
##       Balanced Accuracy : 0.6040          
##                                           
##        'Positive' Class : Bad             
##                                           
## 
## [[3]]
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Bad Good
##       Bad   29   20
##       Good  61  190
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.676, 0.7794)
##     No Information Rate : 0.7            
##     P-Value [Acc > NIR] : 0.1418         
##                                          
##                   Kappa : 0.2609         
##  Mcnemar's Test P-Value : 8.812e-06      
##                                          
##             Sensitivity : 0.32222        
##             Specificity : 0.90476        
##          Pos Pred Value : 0.59184        
##          Neg Pred Value : 0.75697        
##              Prevalence : 0.30000        
##          Detection Rate : 0.09667        
##    Detection Prevalence : 0.16333        
##       Balanced Accuracy : 0.61349        
##                                          
##        'Positive' Class : Bad            
## 
LS0tCnRpdGxlOiAiTGFzc28gYW5kIFJpZGdlIExvZ2lzdGljIChoMm8gUGFja2FnZSkiIApzdWJ0aXRsZTogIlIgZm9yIEtpbGxpbmcgUG5ldW1vbmlhIgphdXRob3I6ICJOZ3V5ZW4gQ2hpIER1bmciCm91dHB1dDoKICBodG1sX2RvY3VtZW50OiAKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICAgICMgY29kZV9mb2xkaW5nOiBoaWRlCiAgICBoaWdobGlnaHQ6IHB5Z21lbnRzCiAgICAjIG51bWJlcl9zZWN0aW9uczogeWVzCiAgICB0aGVtZTogImZsYXRseSIKICAgIHRvYzogVFJVRQogICAgdG9jX2Zsb2F0OiBUUlVFCi0tLQoKYGBge3Igc2V0dXAsaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLCB3YXJuaW5nID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSkKYGBgCgoKIyBMb2dpc3RpYyBhbmQgSXRzIFZlcnNpb25zCgpgYGB7cn0KIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PQojICAgICAgIERhdGEgUHJlLXByb2Nlc3NpbmcKIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PQoKIyBMb2FkIHNvbWUgcGFja2FnZXMgZm9yIGRhdGEgbWFuaXB1bGF0aW9uOiAKbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkobWFncml0dHIpCgojIENsZWFyIHdvcmtzcGFjZTogCnJtKGxpc3QgPSBscygpKQoKIyBVc2UgR2VybWFuQ3JlZGl0OiAKCmxpYnJhcnkoY2FyZXQpCmRhdGEoIkdlcm1hbkNyZWRpdCIpCnNldC5zZWVkKDEpCmlkIDwtIGNyZWF0ZURhdGFQYXJ0aXRpb24oeSA9IEdlcm1hbkNyZWRpdCRDbGFzcywgcCA9IDAuNywgbGlzdCA9IEZBTFNFKQpkZl90cmFpbiA8LSBHZXJtYW5DcmVkaXRbaWQsIF0KZGZfdGVzdCA8LSBHZXJtYW5DcmVkaXRbLWlkLCBdCgoKIyBBY3RpdmF0ZSBoMm8gcGFja2FnZSBmb3IgdXNpbmc6IApsaWJyYXJ5KGgybykKaDJvLmluaXQobnRocmVhZHMgPSAxNiwgbWF4X21lbV9zaXplID0gIjE2ZyIpCmgyby5ub19wcm9ncmVzcygpCgojIENvbnZlcnQgdG8gaDJvIEZyYW1lIGFuZCBpZGVudGlmeSBpbnB1dHMgYW5kIG91dHB1dDogCnRlc3QgPC0gYXMuaDJvKGRmX3Rlc3QpCnRyYWluIDwtIGFzLmgybyhkZl90cmFpbikKCnJlc3BvbnNlIDwtICJDbGFzcyIKcHJlZGljdG9ycyA8LSBzZXRkaWZmKG5hbWVzKHRyYWluKSwgcmVzcG9uc2UpCgoKIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09CiMgICBQdXJlLCBMYXNzbyBhbmQgUmlkZ2UgTG9naXN0aWMKIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09CgojIFRyYWluIExvZ2lzdGljIE1vZGVsOiAKcHVyZV9sb2dpc3RpYyA8LSBoMm8uZ2xtKGZhbWlseT0gImJpbm9taWFsIiwgCiAgICAgICAgICAgICAgICAgICAgICAgICB4ID0gcHJlZGljdG9ycywgCiAgICAgICAgICAgICAgICAgICAgICAgICB5ID0gcmVzcG9uc2UsIAogICAgICAgICAgICAgICAgICAgICAgICAgbGFtYmRhID0gMCwgCiAgICAgICAgICAgICAgICAgICAgICAgICB0cmFpbmluZ19mcmFtZSA9IHRyYWluKQogICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAKIyBGdW5jdGlvbiBTaG93cyB0aGUgY29lZmZpY2llbnRzIHRhYmxlOiAKCnNob3dfY29lZmZzIDwtIGZ1bmN0aW9uKG1vZGVsX3NlbGVjdGVkKSB7CiAgbW9kZWxfc2VsZWN0ZWRAbW9kZWwkY29lZmZpY2llbnRzX3RhYmxlICU+JSAKICAgIGFzLmRhdGEuZnJhbWUoKSAlPiUgCiAgICBtdXRhdGVfaWYoaXMubnVtZXJpYywgZnVuY3Rpb24oeCkge3JvdW5kKHgsIDMpfSkgJT4lIAogICAgZmlsdGVyKGNvZWZmaWNpZW50cyAhPSAwKSAlPiUgCiAgICBrbml0cjo6a2FibGUoKQp9CgoKIyBVc2UgdGhpcyBmdW5jdGlvbjogCnNob3dfY29lZmZzKHB1cmVfbG9naXN0aWMpCgojIExhc3NvIExvZ2lzdGljIE1vZGVsOiAKCmxhc3NvX2xvZ2lzdGljIDwtIGgyby5nbG0oZmFtaWx5ID0gImJpbm9taWFsIiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgYWxwaGEgPSAxLAogICAgICAgICAgICAgICAgICAgICAgICAgIHNlZWQgPSAxOTg4LCAKICAgICAgICAgICAgICAgICAgICAgICAgICB4ID0gcHJlZGljdG9ycywgCiAgICAgICAgICAgICAgICAgICAgICAgICAgeSA9IHJlc3BvbnNlLCAKICAgICAgICAgICAgICAgICAgICAgICAgICB0cmFpbmluZ19mcmFtZSA9IHRyYWluKQoKc2hvd19jb2VmZnMobGFzc29fbG9naXN0aWMpCgojIFJpZGdlIExvZ2lzdGljIE1vZGVsOiAKcmlkZ2VfbG9naXN0aWMgPC0gaDJvLmdsbShmYW1pbHkgPSAiYmlub21pYWwiLCAKICAgICAgICAgICAgICAgICAgICAgICAgICBhbHBoYSA9IDAsCiAgICAgICAgICAgICAgICAgICAgICAgICAgc2VlZCA9IDE5ODgsIAogICAgICAgICAgICAgICAgICAgICAgICAgIHggPSBwcmVkaWN0b3JzLCAKICAgICAgICAgICAgICAgICAgICAgICAgICB5ID0gcmVzcG9uc2UsIAogICAgICAgICAgICAgICAgICAgICAgICAgIHRyYWluaW5nX2ZyYW1lID0gdHJhaW4pCgpzaG93X2NvZWZmcyhyaWRnZV9sb2dpc3RpYykKCiMgRnVuY3Rpb24gc2hvd3MgbW9kZWwgcGVyZm9ybWFuY2Ugb24gdGVzdCBkYXRhOiAKCm15X2NtIDwtIGZ1bmN0aW9uKG1vZGVsX3NlbGVjdGVkKSB7CiAgcHJlZCA8LSBoMm8ucHJlZGljdChtb2RlbF9zZWxlY3RlZCwgdGVzdCkgJT4lIAogICAgYXMuZGF0YS5mcmFtZSgpICU+JSAKICAgIHB1bGwoMSkKICBjb25mdXNpb25NYXRyaXgocHJlZCwgZGZfdGVzdCRDbGFzcywgcG9zaXRpdmUgPSAiQmFkIikgJT4lIAogICAgcmV0dXJuKCkKfQoKbGFwcGx5KGxpc3QocHVyZV9sb2dpc3RpYywgbGFzc29fbG9naXN0aWMsIHJpZGdlX2xvZ2lzdGljKSwgbXlfY20pCmBgYAoKIyBSZWZlcmVuY2VzCgoxLiBodHRwOi8vZG9jcy5oMm8uYWkvaDJvL2xhdGVzdC1zdGFibGUvaDJvLWRvY3MvZGF0YS1zY2llbmNlL2dsbS5odG1sCjIuIGh0dHA6Ly9ycHVicy5jb20vbGVkb25nbmhhdG5hbS8yODg1NTYKCg==