## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Logistic Regression

Logistic Regression에 대해서 알아보자
주요 내용

먼저 테스트 데이터를 활용해 보자
아마도 R에 내장된 자료 중에서는 mtcars가 가장 적합할 것 같다.

Test Dataset - mtcars

Motor Trend Car Road Tests

Description

The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and
10 aspects of automobile design and performance for 32 automobiles (1973–74 models).

A data frame with 32 observations on 11 variables.

[, 1] mpg Miles/(US) gallon
[, 2] cyl Number of cylinders
[, 3] disp Displacement (cu.in.)
[, 4] hp Gross horsepower
[, 5] drat Rear axle ratio
[, 6] wt Weight (1000 lbs)
[, 7] qsec 1/4 mile time
[, 8] vs Engine (0 = V-shaped, 1 = straight)
[, 9] am Transmission (0 = automatic, 1 = manual)
[,10] gear Number of forward gears
[,11] carb Number of carburetors

Logistic Regression - Example

# Load the built-in mtcars data set into the workspace, then print a compact
# one-line-per-column overview of it.
data("mtcars")
glimpse(mtcars)

## Observations: 32
## Variables: 11
## $ mpg  (dbl) 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19....
## $ cyl  (dbl) 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, ...
## $ disp (dbl) 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 1...
## $ hp   (dbl) 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, ...
## $ drat (dbl) 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.9...
## $ wt   (dbl) 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3...
## $ qsec (dbl) 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 2...
## $ vs   (dbl) 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ am   (dbl) 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ gear (dbl) 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, ...
## $ carb (dbl) 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, ...

# Keep only the response (vs) and the candidate predictors used by the model.
# select() is namespace-qualified on purpose: MASS is attached later in this
# document (as a dependency of Deducer) and masks dplyr::select, so an
# unqualified call fails when this chunk is re-run in the same session.
vTmp01 <- mtcars %>%
  dplyr::select(mpg, vs, cyl, disp, hp, wt)
# Alternative, smaller subset (response plus mpg and am only):
# vTmp01 <- mtcars %>% dplyr::select(mpg, vs, am)
vTmp01

##                      mpg vs cyl  disp  hp    wt
## Mazda RX4           21.0  0   6 160.0 110 2.620
## Mazda RX4 Wag       21.0  0   6 160.0 110 2.875
## Datsun 710          22.8  1   4 108.0  93 2.320
## Hornet 4 Drive      21.4  1   6 258.0 110 3.215
## Hornet Sportabout   18.7  0   8 360.0 175 3.440
## Valiant             18.1  1   6 225.0 105 3.460
## Duster 360          14.3  0   8 360.0 245 3.570
## Merc 240D           24.4  1   4 146.7  62 3.190
## Merc 230            22.8  1   4 140.8  95 3.150
## Merc 280            19.2  1   6 167.6 123 3.440
## Merc 280C           17.8  1   6 167.6 123 3.440
## Merc 450SE          16.4  0   8 275.8 180 4.070
## Merc 450SL          17.3  0   8 275.8 180 3.730
## Merc 450SLC         15.2  0   8 275.8 180 3.780
## Cadillac Fleetwood  10.4  0   8 472.0 205 5.250
## Lincoln Continental 10.4  0   8 460.0 215 5.424
## Chrysler Imperial   14.7  0   8 440.0 230 5.345
## Fiat 128            32.4  1   4  78.7  66 2.200
## Honda Civic         30.4  1   4  75.7  52 1.615
## Toyota Corolla      33.9  1   4  71.1  65 1.835
## Toyota Corona       21.5  1   4 120.1  97 2.465
## Dodge Challenger    15.5  0   8 318.0 150 3.520
## AMC Javelin         15.2  0   8 304.0 150 3.435
## Camaro Z28          13.3  0   8 350.0 245 3.840
## Pontiac Firebird    19.2  0   8 400.0 175 3.845
## Fiat X1-9           27.3  1   4  79.0  66 1.935
## Porsche 914-2       26.0  0   4 120.3  91 2.140
## Lotus Europa        30.4  1   4  95.1 113 1.513
## Ford Pantera L      15.8  0   8 351.0 264 3.170
## Ferrari Dino        19.7  0   6 145.0 175 2.770
## Maserati Bora       15.0  0   8 301.0 335 3.570
## Volvo 142E          21.4  1   4 121.0 109 2.780

Make Model of logistic

# Fit a binomial (logistic) GLM with vs as the response and mpg, cyl, disp,
# hp and wt as predictors.
vLogicModel01 <- glm(
  vs ~ mpg + cyl + disp + hp + wt,
  data = vTmp01,
  family = binomial
)
# Smaller alternative kept for reference (needs am in vTmp01):
# vLogicModel01 <- glm(vs ~ mpg + am, data = vTmp01, family = binomial)

Result

# Show the fitted call, the coefficients and the deviance/AIC summary line.
print(vLogicModel01)

## 
## Call:  glm(formula = vs ~ mpg + cyl + disp + hp + wt, family = binomial, 
##     data = vTmp01)
## 
## Coefficients:
## (Intercept)          mpg          cyl         disp           hp  
##   -14.28582      0.60334     -1.55312     -0.02041     -0.05938  
##          wt  
##     7.17654  
## 
## Degrees of Freedom: 31 Total (i.e. Null);  26 Residual
## Null Deviance:       43.86 
## Residual Deviance: 11.56     AIC: 23.56

# Full model summary: deviance residuals, coefficient table with standard
# errors and z-tests, null/residual deviance and AIC.
summary(vLogicModel01)

## 
## Call:
## glm(formula = vs ~ mpg + cyl + disp + hp + wt, family = binomial, 
##     data = vTmp01)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.34645  -0.08400  -0.00066   0.18483   1.15618  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.28582   25.63753  -0.557    0.577
## mpg           0.60334    0.70383   0.857    0.391
## cyl          -1.55312    1.50604  -1.031    0.302
## disp         -0.02041    0.02737  -0.746    0.456
## hp           -0.05938    0.04668  -1.272    0.203
## wt            7.17654    5.48145   1.309    0.190
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 43.860  on 31  degrees of freedom
## Residual deviance: 11.561  on 26  degrees of freedom
## AIC: 23.561
## 
## Number of Fisher Scoring iterations: 8

Return to Log

# Predicted probabilities (response scale) for the same rows the model was
# fitted on.
vReturn_01 <- predict(vLogicModel01, newdata = vTmp01, type = "response")

# Per-observation Bernoulli log-likelihood contributions.
#
# y  : numeric vector of observed outcomes coded 0/1.
# py : numeric vector of predicted probabilities for y == 1.
#
# Returns y * log(py) + (1 - y) * log(1 - py), element-wise.
#
# When a probability is exactly 0 or 1 and agrees with the outcome, the naive
# formula evaluates 0 * log(0) = 0 * -Inf = NaN. The correct contribution in
# that case is 0 (a perfect prediction), so those terms are set to 0. A
# confidently wrong prediction (y = 1, py = 0 or y = 0, py = 1) still yields
# -Inf, exactly as before.
llcomponents <- function(y, py) {
  pos <- y * log(py)
  neg <- (1 - y) * log(1 - py)
  # which() drops NA, so missing inputs keep propagating as NA.
  pos[which(y == 0 & py == 0)] <- 0
  neg[which(y == 1 & py == 1)] <- 0
  pos + neg
}
# Hand-computed deviance residuals: sign(y - p) * sqrt(-2 * loglik_i).
# Their five-number summary matches the "Deviance Residuals" block printed by
# summary(vLogicModel01).
vReturn_02 <- with(
  vTmp01,
  sign(as.numeric(vs) - vReturn_01) *
    sqrt(-2 * llcomponents(as.numeric(vs), vReturn_01))
)
summary(vReturn_02)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -2.3460000 -0.0840000 -0.0006645  0.0076510  0.1848000  1.1560000

# Put the fitted probabilities (rounded to 2 digits) side by side with the
# modelling data for inspection.
# NOTE(review): the probability column gets an auto-generated name derived from
# the expression text, and vReturn_99 is not used again in the visible part of
# this document - confirm whether it is still needed.
vReturn_99<-cbind(data.frame(round(vReturn_01,digits = 2)),vTmp01)

Classification Table

Logistic Regression 모형이 실제로 잘 판별하는지 분류표로 확인한다.

# Turn fitted probabilities into class labels with a 0.5 cut-off:
# "R" when the probability exceeds 0.5, "M" otherwise.
vReturn_01.class <- ifelse(vReturn_01 > 0.5, "R", "M")
# Cross-tabulate observed vs (rows) against the predicted label (columns).
table(Class = vTmp01$vs, pred = vReturn_01.class)

##      pred
## Class  M  R
##     0 17  1
##     1  0 14

# Misclassification rate: share of rows whose 0/1 prediction (fitted
# probability above 0.5) differs from the observed vs.
# The comparison has to use the 0/1 coding. Comparing the numeric vs with the
# "R"/"M" character labels is unequal for every row and always reports 1.
mean(vTmp01$vs != as.numeric(vReturn_01 > 0.5))

## [1] 0.03125

Verification No.0

AIC (Akaike Information Criterion)
AIC는 작을수록 좋다. 이는 모형의 적합도와 간결성을 동시에 고려하는 지수이다.

# Print the model summary again; the AIC discussed in this section is reported
# near the end of the output.
summary(vLogicModel01)

## 
## Call:
## glm(formula = vs ~ mpg + cyl + disp + hp + wt, family = binomial, 
##     data = vTmp01)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.34645  -0.08400  -0.00066   0.18483   1.15618  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.28582   25.63753  -0.557    0.577
## mpg           0.60334    0.70383   0.857    0.391
## cyl          -1.55312    1.50604  -1.031    0.302
## disp         -0.02041    0.02737  -0.746    0.456
## hp           -0.05938    0.04668  -1.272    0.203
## wt            7.17654    5.48145   1.309    0.190
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 43.860  on 31  degrees of freedom
## Residual deviance: 11.561  on 26  degrees of freedom
## AIC: 23.561
## 
## Number of Fisher Scoring iterations: 8

Verification No.1

ROC 곡선이 reference line인 1번 라인에서 2번 라인으로 가까워질수록 모델의 정확도가 향상된다.
ROC곡선 [참조 : http://www.dodomira.com/]

library(Deducer)

## Loading required package: ggplot2

## Loading required package: JGR

## Loading required package: rJava

## Loading required package: JavaGD

## Loading required package: iplots

## 
## Please type JGR() to launch console. Platform specific launchers (.exe and .app) can also be obtained at http://www.rforge.net/JGR/files/.

## Loading required package: car

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## 
## 
## Note Non-JGR console detected:
##  Deducer is best used from within JGR (http://jgr.markushelbig.org/).
##  To Bring up GUI dialogs, type deducer().

# Draw the ROC curve for the fitted logistic model.
# NOTE(review): rocplot() presumably comes from Deducer, attached just above -
# confirm.
rocplot(vLogicModel01)

Verification No.2

Hosmer와 Lemeshow의 검정의 카이제곱 값은 로지스틱 회귀모형의 전체적인 적합도를 나타내는 값이다.
이 값은 작을 수록 모형의 적합도는 높다.

#install.packages("ResourceSelection")
library(ResourceSelection)

## ResourceSelection 0.2-6   2016-02-15

# Hosmer-Lemeshow goodness-of-fit test: observed vs against the fitted
# probabilities, grouped into 10 bins.
vCheck <- hoslem.test(vTmp01$vs, fitted(vLogicModel01), g = 10)
# Observed and expected counts per bin, side by side.
with(vCheck, cbind(observed, expected))

##                     y0 y1       yhat0        yhat1
## [1.41e-08,2.32e-05]  4  0 3.999996670 3.329535e-06
## (2.32e-05,0.000682]  3  0 2.998902800 1.097200e-03
## (0.000682,0.00373]   3  0 2.993596439 6.403561e-03
## (0.00373,0.0216]     3  0 2.964838208 3.516179e-02
## (0.0216,0.301]       3  0 2.741358277 2.586417e-01
## (0.301,0.706]        1  2 1.371064633 1.628935e+00
## (0.706,0.875]        0  3 0.586695966 2.413304e+00
## (0.875,0.962]        1  2 0.289567335 2.710433e+00
## (0.962,0.999]        0  3 0.052812776 2.947187e+00
## (0.999,1]            0  4 0.001166893 3.998833e+00

끝인사

R Logo

데이터 예제로 도움이 되었으면 합니다.
모두가 R을 잘 사용하는 그날까지..

Thanks & best regards

DataAnalysis-LogisticRegression

good4ram

2016년 7월 27일