Logistic and Its Versions
#=================================
# Data Pre-processing
#=================================
# Load some packages for data manipulation:
library(tidyverse)
library(magrittr)
# Clear workspace:
rm(list = ls())
# Use GermanCredit:
library(caret)
data("GermanCredit")
set.seed(1)
id <- createDataPartition(y = GermanCredit$Class, p = 0.7, list = FALSE)
df_train <- GermanCredit[id, ]
df_test <- GermanCredit[-id, ]
# Activate h2o package for using:
library(h2o)
h2o.init(nthreads = 16, max_mem_size = "16g")
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /tmp/RtmpovArcZ/h2o_chidung_started_from_r.out
## /tmp/RtmpovArcZ/h2o_chidung_started_from_r.err
##
##
## Starting H2O JVM and connecting: .. Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 seconds 990 milliseconds
## H2O cluster timezone: Asia/Ho_Chi_Minh
## H2O data parsing timezone: UTC
## H2O cluster version: 3.20.0.8
## H2O cluster version age: 2 months and 24 days
## H2O cluster name: H2O_started_from_R_chidung_yxt660
## H2O cluster total nodes: 1
## H2O cluster total memory: 16.00 GB
## H2O cluster total cores: 32
## H2O cluster allowed cores: 16
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.1 (2018-07-02)
h2o.no_progress()
# Convert to h2o Frame and identify inputs and output:
test <- as.h2o(df_test)
train <- as.h2o(df_train)
response <- "Class"
predictors <- setdiff(names(train), response)
#===================================
# Pure, Lasso and Ridge Logistic
#===================================
# Train Logistic Model:
pure_logistic <- h2o.glm(family= "binomial",
x = predictors,
y = response,
lambda = 0,
training_frame = train)
# Function Shows the coefficients table:
show_coeffs <- function(model_selected) {
model_selected@model$coefficients_table %>%
as.data.frame() %>%
mutate_if(is.numeric, function(x) {round(x, 3)}) %>%
filter(coefficients != 0) %>%
knitr::kable()
}
# Use this function:
show_coeffs(pure_logistic)
Intercept |
5.549 |
1.435 |
Duration |
-0.035 |
-0.411 |
InstallmentRatePercentage |
-0.392 |
-0.440 |
ResidenceDuration |
0.031 |
0.034 |
Age |
0.016 |
0.186 |
NumberExistingCredits |
-0.135 |
-0.080 |
NumberPeopleMaintenance |
-0.131 |
-0.048 |
Telephone |
-0.197 |
-0.097 |
ForeignWorker |
-2.013 |
-0.414 |
CheckingAccountStatus.lt.0 |
-0.764 |
-0.342 |
CheckingAccountStatus.0.to.200 |
-0.474 |
-0.208 |
CheckingAccountStatus.gt.200 |
0.369 |
0.090 |
CheckingAccountStatus.none |
0.924 |
0.453 |
CreditHistory.NoCredit.AllPaid |
-1.701 |
-0.322 |
CreditHistory.ThisBank.AllPaid |
-0.885 |
-0.201 |
CreditHistory.PaidDuly |
-0.117 |
-0.058 |
CreditHistory.Delay |
-0.325 |
-0.090 |
CreditHistory.Critical |
0.762 |
0.350 |
Purpose.NewCar |
-0.884 |
-0.382 |
Purpose.UsedCar |
1.022 |
0.309 |
Purpose.Furniture.Equipment |
0.137 |
0.052 |
Purpose.Radio.Television |
0.180 |
0.081 |
Purpose.DomesticAppliance |
0.036 |
0.004 |
Purpose.Repairs |
-0.415 |
-0.062 |
Purpose.Education |
-0.629 |
-0.137 |
Purpose.Retraining |
9.294 |
0.857 |
Purpose.Business |
-0.218 |
-0.066 |
Purpose.Other |
-0.835 |
-0.063 |
SavingsAccountBonds.lt.100 |
-0.669 |
-0.327 |
SavingsAccountBonds.100.to.500 |
0.326 |
0.104 |
SavingsAccountBonds.500.to.1000 |
-0.066 |
-0.017 |
SavingsAccountBonds.gt.1000 |
0.620 |
0.133 |
SavingsAccountBonds.Unknown |
0.751 |
0.278 |
EmploymentDuration.lt.1 |
-0.258 |
-0.099 |
EmploymentDuration.1.to.4 |
0.037 |
0.017 |
EmploymentDuration.4.to.7 |
0.538 |
0.209 |
EmploymentDuration.gt.7 |
-0.157 |
-0.069 |
EmploymentDuration.Unemployed |
-0.385 |
-0.089 |
Personal.Male.Divorced.Seperated |
-0.183 |
-0.038 |
Personal.Female.NotSingle |
-0.216 |
-0.100 |
Personal.Male.Single |
0.271 |
0.135 |
Personal.Male.Married.Widowed |
-0.146 |
-0.044 |
OtherDebtorsGuarantors.None |
-0.184 |
-0.054 |
OtherDebtorsGuarantors.CoApplicant |
-0.955 |
-0.181 |
OtherDebtorsGuarantors.Guarantor |
0.909 |
0.214 |
Property.RealEstate |
0.279 |
0.125 |
Property.Insurance |
-0.268 |
-0.115 |
Property.CarOther |
0.138 |
0.065 |
Property.Unknown |
-0.308 |
-0.108 |
OtherInstallmentPlans.Bank |
-0.534 |
-0.182 |
OtherInstallmentPlans.Stores |
-0.284 |
-0.062 |
OtherInstallmentPlans.None |
0.503 |
0.195 |
Housing.Rent |
-0.337 |
-0.132 |
Housing.Own |
-0.039 |
-0.017 |
Housing.ForFree |
0.708 |
0.206 |
Job.UnemployedUnskilled |
0.458 |
0.070 |
Job.UnskilledResident |
-0.072 |
-0.030 |
Job.SkilledEmployee |
-0.034 |
-0.017 |
Job.Management.SelfEmp.HighlyQualified |
0.075 |
0.027 |
# Lasso Logistic Model:
lasso_logistic <- h2o.glm(family = "binomial",
alpha = 1,
seed = 1988,
x = predictors,
y = response,
training_frame = train)
show_coeffs(lasso_logistic)
Intercept |
2.160 |
1.098 |
Duration |
-0.029 |
-0.338 |
InstallmentRatePercentage |
-0.167 |
-0.187 |
Age |
0.005 |
0.053 |
ForeignWorker |
-0.734 |
-0.151 |
CheckingAccountStatus.lt.0 |
-0.175 |
-0.078 |
CheckingAccountStatus.gt.200 |
0.510 |
0.125 |
CheckingAccountStatus.none |
1.157 |
0.567 |
CreditHistory.NoCredit.AllPaid |
-1.042 |
-0.197 |
CreditHistory.ThisBank.AllPaid |
-0.300 |
-0.068 |
CreditHistory.Critical |
0.581 |
0.267 |
Purpose.NewCar |
-0.459 |
-0.198 |
Purpose.UsedCar |
0.632 |
0.191 |
Purpose.Radio.Television |
0.175 |
0.079 |
Purpose.Education |
-0.124 |
-0.027 |
Purpose.Retraining |
0.824 |
0.076 |
SavingsAccountBonds.lt.100 |
-0.637 |
-0.312 |
SavingsAccountBonds.Unknown |
0.095 |
0.035 |
EmploymentDuration.lt.1 |
-0.219 |
-0.084 |
EmploymentDuration.4.to.7 |
0.190 |
0.074 |
Personal.Female.NotSingle |
-0.013 |
-0.006 |
Personal.Male.Single |
0.069 |
0.034 |
OtherDebtorsGuarantors.CoApplicant |
-0.280 |
-0.053 |
OtherDebtorsGuarantors.Guarantor |
0.534 |
0.125 |
Property.RealEstate |
0.143 |
0.064 |
Property.Insurance |
-0.020 |
-0.008 |
OtherInstallmentPlans.None |
0.655 |
0.254 |
Housing.Rent |
-0.168 |
-0.066 |
# Ridge Logistic Model:
ridge_logistic <- h2o.glm(family = "binomial",
alpha = 0,
seed = 1988,
x = predictors,
y = response,
training_frame = train)
show_coeffs(ridge_logistic)
Intercept |
4.364 |
1.278 |
Duration |
-0.032 |
-0.374 |
InstallmentRatePercentage |
-0.305 |
-0.342 |
ResidenceDuration |
0.028 |
0.031 |
Age |
0.013 |
0.156 |
NumberExistingCredits |
-0.085 |
-0.050 |
NumberPeopleMaintenance |
-0.091 |
-0.033 |
Telephone |
-0.150 |
-0.074 |
ForeignWorker |
-1.510 |
-0.311 |
CheckingAccountStatus.lt.0 |
-0.697 |
-0.312 |
CheckingAccountStatus.0.to.200 |
-0.432 |
-0.189 |
CheckingAccountStatus.gt.200 |
0.372 |
0.091 |
CheckingAccountStatus.none |
0.833 |
0.408 |
CreditHistory.NoCredit.AllPaid |
-1.489 |
-0.282 |
CreditHistory.ThisBank.AllPaid |
-0.732 |
-0.166 |
CreditHistory.PaidDuly |
-0.093 |
-0.046 |
CreditHistory.Delay |
-0.303 |
-0.084 |
CreditHistory.Critical |
0.653 |
0.299 |
Purpose.NewCar |
-0.695 |
-0.300 |
Purpose.UsedCar |
0.898 |
0.271 |
Purpose.Furniture.Equipment |
0.167 |
0.063 |
Purpose.Radio.Television |
0.239 |
0.107 |
Purpose.DomesticAppliance |
0.089 |
0.009 |
Purpose.Repairs |
-0.335 |
-0.050 |
Purpose.Education |
-0.491 |
-0.107 |
Purpose.Retraining |
2.821 |
0.260 |
Purpose.Business |
-0.160 |
-0.048 |
Purpose.Other |
-0.705 |
-0.053 |
SavingsAccountBonds.lt.100 |
-0.580 |
-0.284 |
SavingsAccountBonds.100.to.500 |
0.265 |
0.084 |
SavingsAccountBonds.500.to.1000 |
0.067 |
0.017 |
SavingsAccountBonds.gt.1000 |
0.505 |
0.109 |
SavingsAccountBonds.Unknown |
0.616 |
0.228 |
EmploymentDuration.lt.1 |
-0.265 |
-0.102 |
EmploymentDuration.1.to.4 |
0.037 |
0.017 |
EmploymentDuration.4.to.7 |
0.445 |
0.173 |
EmploymentDuration.gt.7 |
-0.106 |
-0.046 |
EmploymentDuration.Unemployed |
-0.287 |
-0.067 |
Personal.Male.Divorced.Seperated |
-0.159 |
-0.033 |
Personal.Female.NotSingle |
-0.178 |
-0.082 |
Personal.Male.Single |
0.210 |
0.105 |
Personal.Male.Married.Widowed |
-0.080 |
-0.024 |
OtherDebtorsGuarantors.None |
-0.152 |
-0.045 |
OtherDebtorsGuarantors.CoApplicant |
-0.828 |
-0.157 |
OtherDebtorsGuarantors.Guarantor |
0.775 |
0.182 |
Property.RealEstate |
0.245 |
0.110 |
Property.Insurance |
-0.230 |
-0.099 |
Property.CarOther |
0.082 |
0.039 |
Property.Unknown |
-0.206 |
-0.072 |
OtherInstallmentPlans.Bank |
-0.466 |
-0.159 |
OtherInstallmentPlans.Stores |
-0.298 |
-0.065 |
OtherInstallmentPlans.None |
0.454 |
0.176 |
Housing.Rent |
-0.273 |
-0.107 |
Housing.Own |
0.007 |
0.003 |
Housing.ForFree |
0.482 |
0.140 |
Job.UnemployedUnskilled |
0.275 |
0.042 |
Job.UnskilledResident |
-0.031 |
-0.013 |
Job.SkilledEmployee |
-0.026 |
-0.013 |
Job.Management.SelfEmp.HighlyQualified |
0.038 |
0.014 |
# Function shows model performance on test data:
my_cm <- function(model_selected) {
pred <- h2o.predict(model_selected, test) %>%
as.data.frame() %>%
pull(1)
confusionMatrix(pred, df_test$Class, positive = "Bad") %>%
return()
}
lapply(list(pure_logistic, lasso_logistic, ridge_logistic), my_cm)
## [[1]]
## Confusion Matrix and Statistics
##
## Reference
## Prediction Bad Good
## Bad 26 17
## Good 64 193
##
## Accuracy : 0.73
## 95% CI : (0.676, 0.7794)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.1418
##
## Kappa : 0.2444
## Mcnemar's Test P-Value : 3.203e-07
##
## Sensitivity : 0.28889
## Specificity : 0.91905
## Pos Pred Value : 0.60465
## Neg Pred Value : 0.75097
## Prevalence : 0.30000
## Detection Rate : 0.08667
## Detection Prevalence : 0.14333
## Balanced Accuracy : 0.60397
##
## 'Positive' Class : Bad
##
##
## [[2]]
## Confusion Matrix and Statistics
##
## Reference
## Prediction Bad Good
## Bad 32 31
## Good 58 179
##
## Accuracy : 0.7033
## 95% CI : (0.6481, 0.7545)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.478232
##
## Kappa : 0.2274
## Mcnemar's Test P-Value : 0.005851
##
## Sensitivity : 0.3556
## Specificity : 0.8524
## Pos Pred Value : 0.5079
## Neg Pred Value : 0.7553
## Prevalence : 0.3000
## Detection Rate : 0.1067
## Detection Prevalence : 0.2100
## Balanced Accuracy : 0.6040
##
## 'Positive' Class : Bad
##
##
## [[3]]
## Confusion Matrix and Statistics
##
## Reference
## Prediction Bad Good
## Bad 29 20
## Good 61 190
##
## Accuracy : 0.73
## 95% CI : (0.676, 0.7794)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.1418
##
## Kappa : 0.2609
## Mcnemar's Test P-Value : 8.812e-06
##
## Sensitivity : 0.32222
## Specificity : 0.90476
## Pos Pred Value : 0.59184
## Neg Pred Value : 0.75697
## Prevalence : 0.30000
## Detection Rate : 0.09667
## Detection Prevalence : 0.16333
## Balanced Accuracy : 0.61349
##
## 'Positive' Class : Bad
##
LS0tCnRpdGxlOiAiTGFzc28gYW5kIFJpZGdlIExvZ2lzdGljIChoMm8gUGFja2FnZSkiIApzdWJ0aXRsZTogIlIgZm9yIEtpbGxpbmcgUG5ldW1vbmlhIgphdXRob3I6ICJOZ3V5ZW4gQ2hpIER1bmciCm91dHB1dDoKICBodG1sX2RvY3VtZW50OiAKICAgIGNvZGVfZG93bmxvYWQ6IHRydWUKICAgICMgY29kZV9mb2xkaW5nOiBoaWRlCiAgICBoaWdobGlnaHQ6IHB5Z21lbnRzCiAgICAjIG51bWJlcl9zZWN0aW9uczogeWVzCiAgICB0aGVtZTogImZsYXRseSIKICAgIHRvYzogVFJVRQogICAgdG9jX2Zsb2F0OiBUUlVFCi0tLQoKYGBge3Igc2V0dXAsaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLCB3YXJuaW5nID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSkKYGBgCgoKIyBMb2dpc3RpYyBhbmQgSXRzIFZlcnNpb25zCgpgYGB7cn0KIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PQojICAgICAgIERhdGEgUHJlLXByb2Nlc3NpbmcKIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PQoKIyBMb2FkIHNvbWUgcGFja2FnZXMgZm9yIGRhdGEgbWFuaXB1bGF0aW9uOiAKbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkobWFncml0dHIpCgojIENsZWFyIHdvcmtzcGFjZTogCnJtKGxpc3QgPSBscygpKQoKIyBVc2UgR2VybWFuQ3JlZGl0OiAKCmxpYnJhcnkoY2FyZXQpCmRhdGEoIkdlcm1hbkNyZWRpdCIpCnNldC5zZWVkKDEpCmlkIDwtIGNyZWF0ZURhdGFQYXJ0aXRpb24oeSA9IEdlcm1hbkNyZWRpdCRDbGFzcywgcCA9IDAuNywgbGlzdCA9IEZBTFNFKQpkZl90cmFpbiA8LSBHZXJtYW5DcmVkaXRbaWQsIF0KZGZfdGVzdCA8LSBHZXJtYW5DcmVkaXRbLWlkLCBdCgoKIyBBY3RpdmF0ZSBoMm8gcGFja2FnZSBmb3IgdXNpbmc6IApsaWJyYXJ5KGgybykKaDJvLmluaXQobnRocmVhZHMgPSAxNiwgbWF4X21lbV9zaXplID0gIjE2ZyIpCmgyby5ub19wcm9ncmVzcygpCgojIENvbnZlcnQgdG8gaDJvIEZyYW1lIGFuZCBpZGVudGlmeSBpbnB1dHMgYW5kIG91dHB1dDogCnRlc3QgPC0gYXMuaDJvKGRmX3Rlc3QpCnRyYWluIDwtIGFzLmgybyhkZl90cmFpbikKCnJlc3BvbnNlIDwtICJDbGFzcyIKcHJlZGljdG9ycyA8LSBzZXRkaWZmKG5hbWVzKHRyYWluKSwgcmVzcG9uc2UpCgoKIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09CiMgICBQdXJlLCBMYXNzbyBhbmQgUmlkZ2UgTG9naXN0aWMKIz09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09PT09CgojIFRyYWluIExvZ2lzdGljIE1vZGVsOiAKcHVyZV9sb2dpc3RpYyA8LSBoMm8uZ2xtKGZhbWlseT0gImJpbm9taWFsIiwgCiAgICAgICAgICAgICAgICAgICAgICAgICB4ID0gcHJlZGljdG9ycywgCiAgICAgICAgICAgICAgICAgICAgICAgICB5ID0gcmVzcG9uc2UsIAogICAgICAgICAgICAgICAgICAgICAgICAgbGFtYmRhID0gMCwgCiAgICAgICAgICAgICAgICAgICAgICAgICB0cmFpbmluZ19mcmFtZSA9IHRyYWluKQogICAgICAgICAgICAgICAgICAgICAgICAgCiAgICAgICAgICAgICAgICAgICAgICAKIyBGdW5jdGlvbiBTaG93cyB0aGUgY29lZmZpY2llbnRzIHRhYmxlOiAKCnNob3dfY29lZmZzIDwtIGZ1bmN0aW9uKG1vZGVsX3NlbGVjdGVkKSB7CiAgbW9kZWxfc2VsZWN0ZWRAbW9kZWwkY29lZmZpY2llbnRzX3RhYmxlICU+JSAKICAgIGFzLmRhdGEuZnJhbWUoKSAlPiUgCiAgICBtdXRhdGVfaWYoaXMubnVtZXJpYywgZnVuY3Rpb24oeCkge3JvdW5kKHgsIDMpfSkgJT4lIAogICAgZmlsdGVyKGNvZWZmaWNpZW50cyAhPSAwKSAlPiUgCiAgICBrbml0cjo6a2FibGUoKQp9CgoKIyBVc2UgdGhpcyBmdW5jdGlvbjogCnNob3dfY29lZmZzKHB1cmVfbG9naXN0aWMpCgojIExhc3NvIExvZ2lzdGljIE1vZGVsOiAKCmxhc3NvX2xvZ2lzdGljIDwtIGgyby5nbG0oZmFtaWx5ID0gImJpbm9taWFsIiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgYWxwaGEgPSAxLAogICAgICAgICAgICAgICAgICAgICAgICAgIHNlZWQgPSAxOTg4LCAKICAgICAgICAgICAgICAgICAgICAgICAgICB4ID0gcHJlZGljdG9ycywgCiAgICAgICAgICAgICAgICAgICAgICAgICAgeSA9IHJlc3BvbnNlLCAKICAgICAgICAgICAgICAgICAgICAgICAgICB0cmFpbmluZ19mcmFtZSA9IHRyYWluKQoKc2hvd19jb2VmZnMobGFzc29fbG9naXN0aWMpCgojIFJpZGdlIExvZ2lzdGljIE1vZGVsOiAKcmlkZ2VfbG9naXN0aWMgPC0gaDJvLmdsbShmYW1pbHkgPSAiYmlub21pYWwiLCAKICAgICAgICAgICAgICAgICAgICAgICAgICBhbHBoYSA9IDAsCiAgICAgICAgICAgICAgICAgICAgICAgICAgc2VlZCA9IDE5ODgsIAogICAgICAgICAgICAgICAgICAgICAgICAgIHggPSBwcmVkaWN0b3JzLCAKICAgICAgICAgICAgICAgICAgICAgICAgICB5ID0gcmVzcG9uc2UsIAogICAgICAgICAgICAgICAgICAgICAgICAgIHRyYWluaW5nX2ZyYW1lID0gdHJhaW4pCgpzaG93X2NvZWZmcyhyaWRnZV9sb2dpc3RpYykKCiMgRnVuY3Rpb24gc2hvd3MgbW9kZWwgcGVyZm9ybWFuY2Ugb24gdGVzdCBkYXRhOiAKCm15X2NtIDwtIGZ1bmN0aW9uKG1vZGVsX3NlbGVjdGVkKSB7CiAgcHJlZCA8LSBoMm8ucHJlZGljdChtb2RlbF9zZWxlY3RlZCwgdGVzdCkgJT4lIAogICAgYXMuZGF0YS5mcmFtZSgpICU+JSAKICAgIHB1bGwoMSkKICBjb25mdXNpb25NYXRyaXgocHJlZCwgZGZfdGVzdCRDbGFzcywgcG9zaXRpdmUgPSAiQmFkIikgJT4lIAogICAgcmV0dXJuKCkKfQoKbGFwcGx5KGxpc3QocHVyZV9sb2dpc3RpYywgbGFzc29fbG9naXN0aWMsIHJpZGdlX2xvZ2lzdGljKSwgbXlfY20pCmBgYAoKIyBSZWZlcmVuY2VzCgoxLiBodHRwOi8vZG9jcy5oMm8uYWkvaDJvL2xhdGVzdC1zdGFibGUvaDJvLWRvY3MvZGF0YS1zY2llbmNlL2dsbS5odG1sCjIuIGh0dHA6Ly9ycHVicy5jb20vbGVkb25nbmhhdG5hbS8yODg1NTYKCg==