Regresja logistyczna jest modelem matematycznym, pozwalającym opisać wpływ kilku zmiennych \(X_1, X_2,...,X_k\) na dychotomiczną zmienną . Model regresji logistycznej:
Niech \(Y\) oznacza zmienną dychotomiczną, przyjmującą wartości:
• 1 – najczęściej dla zdarzeń pożądanych, jak np. przeżycie, wyzdrowienie, sukces
• 0 – w przeciwnym przypadku, np. zgon lub choroba, porażka Wówczas logistyczny model regresji określa równanie: \[(Y=1\mid x_1,x_2,...x_k)=P(X)=\frac{e^{a_0+\sum_{i=1}^ka_ix_i}}{1+e^{a_0+\sum_{i=1}^ka_ix_i}} \] Gdzie:
\(a_i, i=0,,,,k\) są współczynnikami regresji,
\(x_1,x_2,...x_k\) - zmienne niezależne, predyktory, zmienne wyjaśniające
Z racji, że zmienna zależna, wyjaśniana przyjmuje dwie, dychotomiczne wartości 0 i 1 równanie regresji określa prawdopodobieństwo wystąpienia danego zdarzenia dla wartości predyktorów wprowadzonych do modelu regresji logistycznej.
Przykład:
library(tidyverse)
library(lubridate)
credit<-read_csv(file="dane/dataset_31_credit-g.csv")
head(credit,10)
## # A tibble: 10 x 21
## checking_status duration credit_history purpose credit_amount savings_status
## <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 '<0' 6 'critical/oth… radio/… 1169 'no known sav…
## 2 '0<=X<200' 48 'existing pai… radio/… 5951 '<100'
## 3 'no checking' 12 'critical/oth… educat… 2096 '<100'
## 4 '<0' 42 'existing pai… furnit… 7882 '<100'
## 5 '<0' 24 'delayed prev… 'new c… 4870 '<100'
## 6 'no checking' 36 'existing pai… educat… 9055 'no known sav…
## 7 'no checking' 24 'existing pai… furnit… 2835 '500<=X<1000'
## 8 '0<=X<200' 36 'existing pai… 'used … 6948 '<100'
## 9 'no checking' 12 'existing pai… radio/… 3059 '>=1000'
## 10 '0<=X<200' 30 'critical/oth… 'new c… 5234 '<100'
## # … with 15 more variables: employment <chr>, installment_commitment <dbl>,
## # personal_status <chr>, other_parties <chr>, residence_since <dbl>,
## # property_magnitude <chr>, age <dbl>, other_payment_plans <chr>,
## # housing <chr>, existing_credits <dbl>, job <chr>, num_dependents <dbl>,
## # own_telephone <chr>, foreign_worker <chr>, class <chr>
dim(credit)
## [1] 1000 21
str(credit)
## tibble [1,000 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ checking_status : chr [1:1000] "'<0'" "'0<=X<200'" "'no checking'" "'<0'" ...
## $ duration : num [1:1000] 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_history : chr [1:1000] "'critical/other existing credit'" "'existing paid'" "'critical/other existing credit'" "'existing paid'" ...
## $ purpose : chr [1:1000] "radio/tv" "radio/tv" "education" "furniture/equipment" ...
## $ credit_amount : num [1:1000] 1169 5951 2096 7882 4870 ...
## $ savings_status : chr [1:1000] "'no known savings'" "'<100'" "'<100'" "'<100'" ...
## $ employment : chr [1:1000] "'>=7'" "'1<=X<4'" "'4<=X<7'" "'4<=X<7'" ...
## $ installment_commitment: num [1:1000] 4 2 2 2 3 2 3 2 2 4 ...
## $ personal_status : chr [1:1000] "'male single'" "'female div/dep/mar'" "'male single'" "'male single'" ...
## $ other_parties : chr [1:1000] "none" "none" "none" "guarantor" ...
## $ residence_since : num [1:1000] 4 2 3 4 4 4 4 2 4 2 ...
## $ property_magnitude : chr [1:1000] "'real estate'" "'real estate'" "'real estate'" "'life insurance'" ...
## $ age : num [1:1000] 67 22 49 45 53 35 53 35 61 28 ...
## $ other_payment_plans : chr [1:1000] "none" "none" "none" "none" ...
## $ housing : chr [1:1000] "own" "own" "own" "'for free'" ...
## $ existing_credits : num [1:1000] 2 1 1 1 2 1 1 1 1 2 ...
## $ job : chr [1:1000] "skilled" "skilled" "'unskilled resident'" "skilled" ...
## $ num_dependents : num [1:1000] 1 1 2 2 2 2 1 1 1 1 ...
## $ own_telephone : chr [1:1000] "yes" "none" "none" "none" ...
## $ foreign_worker : chr [1:1000] "yes" "yes" "yes" "yes" ...
## $ class : chr [1:1000] "good" "bad" "good" "good" ...
## - attr(*, "spec")=
## .. cols(
## .. checking_status = col_character(),
## .. duration = col_double(),
## .. credit_history = col_character(),
## .. purpose = col_character(),
## .. credit_amount = col_double(),
## .. savings_status = col_character(),
## .. employment = col_character(),
## .. installment_commitment = col_double(),
## .. personal_status = col_character(),
## .. other_parties = col_character(),
## .. residence_since = col_double(),
## .. property_magnitude = col_character(),
## .. age = col_double(),
## .. other_payment_plans = col_character(),
## .. housing = col_character(),
## .. existing_credits = col_double(),
## .. job = col_character(),
## .. num_dependents = col_double(),
## .. own_telephone = col_character(),
## .. foreign_worker = col_character(),
## .. class = col_character()
## .. )
credit$class<-ifelse(credit$class=="good",1,0)
credit
## # A tibble: 1,000 x 21
## checking_status duration credit_history purpose credit_amount savings_status
## <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 '<0' 6 'critical/oth… radio/… 1169 'no known sav…
## 2 '0<=X<200' 48 'existing pai… radio/… 5951 '<100'
## 3 'no checking' 12 'critical/oth… educat… 2096 '<100'
## 4 '<0' 42 'existing pai… furnit… 7882 '<100'
## 5 '<0' 24 'delayed prev… 'new c… 4870 '<100'
## 6 'no checking' 36 'existing pai… educat… 9055 'no known sav…
## 7 'no checking' 24 'existing pai… furnit… 2835 '500<=X<1000'
## 8 '0<=X<200' 36 'existing pai… 'used … 6948 '<100'
## 9 'no checking' 12 'existing pai… radio/… 3059 '>=1000'
## 10 '0<=X<200' 30 'critical/oth… 'new c… 5234 '<100'
## # … with 990 more rows, and 15 more variables: employment <chr>,
## # installment_commitment <dbl>, personal_status <chr>, other_parties <chr>,
## # residence_since <dbl>, property_magnitude <chr>, age <dbl>,
## # other_payment_plans <chr>, housing <chr>, existing_credits <dbl>,
## # job <chr>, num_dependents <dbl>, own_telephone <chr>, foreign_worker <chr>,
## # class <dbl>
str(credit$class)
## num [1:1000] 1 0 1 1 0 1 1 1 1 0 ...
set.seed(100)
index_train <- sample(1:nrow(credit), 2 / 3 * nrow(credit))
training_set <- credit[index_train, ]
test_set <- credit[-index_train, ]
reg_log<-glm(class~., data=training_set, family="binomial")
summary(reg_log)
##
## Call:
## glm(formula = class ~ ., family = "binomial", data = training_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6000 -0.6663 0.3406 0.6853 2.2653
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 7.970e-01 1.780e+00 0.448
## checking_status'>=200' 1.133e+00 4.746e-01 2.388
## checking_status'0<=X<200' 4.592e-01 2.721e-01 1.688
## checking_status'no checking' 1.796e+00 2.906e-01 6.180
## duration -2.781e-02 1.198e-02 -2.321
## credit_history'critical/other existing credit' 1.411e+00 5.607e-01 2.516
## credit_history'delayed previously' 3.751e-01 6.014e-01 0.624
## credit_history'existing paid' 8.687e-01 5.088e-01 1.707
## credit_history'no credits/all paid' -3.423e-01 7.083e-01 -0.483
## purpose'new car' 1.560e-01 9.567e-01 0.163
## purpose'used car' 1.659e+00 1.023e+00 1.621
## purposebusiness 1.236e+00 1.012e+00 1.221
## purposeeducation 2.249e-01 1.058e+00 0.212
## purposefurniture/equipment 6.356e-01 9.588e-01 0.663
## purposeother 3.486e+00 1.746e+00 1.997
## purposeradio/tv 9.533e-01 9.604e-01 0.993
## purposerepairs -3.301e-01 1.113e+00 -0.297
## purposeretraining 1.588e+01 5.301e+02 0.030
## credit_amount -1.533e-04 5.637e-05 -2.719
## savings_status'>=1000' 1.492e+00 5.980e-01 2.494
## savings_status'100<=X<500' 3.037e-01 3.633e-01 0.836
## savings_status'500<=X<1000' 1.800e-01 4.742e-01 0.379
## savings_status'no known savings' 8.091e-01 3.185e-01 2.541
## employment'>=7' 3.127e-01 3.660e-01 0.854
## employment'1<=X<4' 7.450e-02 2.996e-01 0.249
## employment'4<=X<7' 7.288e-01 3.695e-01 1.972
## employmentunemployed 1.109e-01 5.302e-01 0.209
## installment_commitment -3.840e-01 1.090e-01 -3.521
## personal_status'male div/sep' -9.825e-01 5.228e-01 -1.879
## personal_status'male mar/wid' -1.985e-01 3.751e-01 -0.529
## personal_status'male single' 4.054e-01 2.581e-01 1.571
## other_partiesguarantor 1.604e+00 7.315e-01 2.193
## other_partiesnone 3.254e-01 4.896e-01 0.665
## residence_since 5.665e-02 1.065e-01 0.532
## property_magnitude'no known property' -1.092e-01 5.127e-01 -0.213
## property_magnitude'real estate' 5.432e-01 3.170e-01 1.714
## property_magnitudecar 4.690e-01 2.884e-01 1.626
## age 1.773e-02 1.163e-02 1.524
## other_payment_plansnone 9.634e-01 3.040e-01 3.169
## other_payment_plansstores 8.834e-01 5.392e-01 1.638
## housingown -9.564e-02 5.608e-01 -0.171
## housingrent -6.305e-01 5.847e-01 -1.078
## existing_credits -2.890e-01 2.333e-01 -1.239
## job'unemp/unskilled non res' -2.434e-02 9.036e-01 -0.027
## job'unskilled resident' -4.834e-01 4.652e-01 -1.039
## jobskilled -4.575e-01 3.817e-01 -1.199
## num_dependents -2.254e-01 3.207e-01 -0.703
## own_telephoneyes 1.311e-01 2.452e-01 0.534
## foreign_workeryes -1.726e+00 7.682e-01 -2.246
## Pr(>|z|)
## (Intercept) 0.654270
## checking_status'>=200' 0.016950 *
## checking_status'0<=X<200' 0.091418 .
## checking_status'no checking' 6.4e-10 ***
## duration 0.020278 *
## credit_history'critical/other existing credit' 0.011858 *
## credit_history'delayed previously' 0.532812
## credit_history'existing paid' 0.087750 .
## credit_history'no credits/all paid' 0.628956
## purpose'new car' 0.870508
## purpose'used car' 0.105031
## purposebusiness 0.222062
## purposeeducation 0.831759
## purposefurniture/equipment 0.507389
## purposeother 0.045868 *
## purposeradio/tv 0.320929
## purposerepairs 0.766751
## purposeretraining 0.976107
## credit_amount 0.006552 **
## savings_status'>=1000' 0.012614 *
## savings_status'100<=X<500' 0.403073
## savings_status'500<=X<1000' 0.704335
## savings_status'no known savings' 0.011062 *
## employment'>=7' 0.392992
## employment'1<=X<4' 0.803619
## employment'4<=X<7' 0.048577 *
## employmentunemployed 0.834290
## installment_commitment 0.000429 ***
## personal_status'male div/sep' 0.060230 .
## personal_status'male mar/wid' 0.596665
## personal_status'male single' 0.116255
## other_partiesguarantor 0.028328 *
## other_partiesnone 0.506232
## residence_since 0.594840
## property_magnitude'no known property' 0.831268
## property_magnitude'real estate' 0.086601 .
## property_magnitudecar 0.103925
## age 0.127455
## other_payment_plansnone 0.001530 **
## other_payment_plansstores 0.101352
## housingown 0.864587
## housingrent 0.280862
## existing_credits 0.215414
## job'unemp/unskilled non res' 0.978512
## job'unskilled resident' 0.298744
## jobskilled 0.230721
## num_dependents 0.482130
## own_telephoneyes 0.593026
## foreign_workeryes 0.024683 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 820.66 on 665 degrees of freedom
## Residual deviance: 585.99 on 617 degrees of freedom
## AIC: 683.99
##
## Number of Fisher Scoring iterations: 14
log_predictions<-predict(reg_log, newdata=test_set, type="response")
head(log_predictions, 10)
## 1 2 3 4 5 6 7 8
## 0.9773777 0.1646490 0.7652190 0.9638105 0.9241922 0.7625606 0.9924606 0.9465828
## 9 10
## 0.2368044 0.5862853
log_predictions1<-ifelse(log_predictions>0.5,1,0)
head(log_predictions1, 10)
## 1 2 3 4 5 6 7 8 9 10
## 1 0 1 1 1 1 1 1 0 1
y_pred_num <- ifelse(log_predictions > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1))
y_act <- test_set$class
mean(y_pred == y_act)
## [1] 0.754491