##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Logistic Regression에 대해서 알아보자
주요 내용
먼저 테스트 데이터를 활용해 보자
아마도 R에 내장된 자료 중에서는 mtcars가 가장 좋을 것 같다.
Motor Trend Car Road Tests
Description
The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and
10 aspects of automobile design and performance for 32 automobiles (1973–74 models).
A data frame with 32 observations on 11 variables.
[, 1] mpg Miles/(US) gallon
[, 2] cyl Number of cylinders
[, 3] disp Displacement (cu.in.)
[, 4] hp Gross horsepower
[, 5] drat Rear axle ratio
[, 6] wt Weight (1000 lbs)
[, 7] qsec 1/4 mile time
[, 8] vs Engine (0 = V-shaped, 1 = straight)
[, 9] am Transmission (0 = automatic, 1 = manual)
[,10] gear Number of forward gears
[,11] carb Number of carburetors
# Load the built-in mtcars data set, then print a compact preview that lists
# every column with its type and first few values.
data("mtcars")
mtcars %>% glimpse()
## Observations: 32
## Variables: 11
## $ mpg (dbl) 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19....
## $ cyl (dbl) 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, ...
## $ disp (dbl) 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 1...
## $ hp (dbl) 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, ...
## $ drat (dbl) 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.9...
## $ wt (dbl) 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3...
## $ qsec (dbl) 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 2...
## $ vs (dbl) 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ am (dbl) 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ gear (dbl) 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, ...
## $ carb (dbl) 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, ...
# Keep only the response (vs) and the five predictors used by the model.
# select() is namespace-qualified on purpose: library(Deducer) attaches MASS,
# whose select() masks dplyr's, so an unqualified call fails as soon as this
# line is re-run in a session where Deducer has already been loaded.
vTmp01 <- mtcars %>% dplyr::select(mpg, vs, cyl, disp, hp, wt)
# Alternative subset for a smaller model:
# vTmp01 <- mtcars %>% dplyr::select(mpg, vs, am)
# Print the working data frame so the selected columns can be inspected.
vTmp01
## mpg vs cyl disp hp wt
## Mazda RX4 21.0 0 6 160.0 110 2.620
## Mazda RX4 Wag 21.0 0 6 160.0 110 2.875
## Datsun 710 22.8 1 4 108.0 93 2.320
## Hornet 4 Drive 21.4 1 6 258.0 110 3.215
## Hornet Sportabout 18.7 0 8 360.0 175 3.440
## Valiant 18.1 1 6 225.0 105 3.460
## Duster 360 14.3 0 8 360.0 245 3.570
## Merc 240D 24.4 1 4 146.7 62 3.190
## Merc 230 22.8 1 4 140.8 95 3.150
## Merc 280 19.2 1 6 167.6 123 3.440
## Merc 280C 17.8 1 6 167.6 123 3.440
## Merc 450SE 16.4 0 8 275.8 180 4.070
## Merc 450SL 17.3 0 8 275.8 180 3.730
## Merc 450SLC 15.2 0 8 275.8 180 3.780
## Cadillac Fleetwood 10.4 0 8 472.0 205 5.250
## Lincoln Continental 10.4 0 8 460.0 215 5.424
## Chrysler Imperial 14.7 0 8 440.0 230 5.345
## Fiat 128 32.4 1 4 78.7 66 2.200
## Honda Civic 30.4 1 4 75.7 52 1.615
## Toyota Corolla 33.9 1 4 71.1 65 1.835
## Toyota Corona 21.5 1 4 120.1 97 2.465
## Dodge Challenger 15.5 0 8 318.0 150 3.520
## AMC Javelin 15.2 0 8 304.0 150 3.435
## Camaro Z28 13.3 0 8 350.0 245 3.840
## Pontiac Firebird 19.2 0 8 400.0 175 3.845
## Fiat X1-9 27.3 1 4 79.0 66 1.935
## Porsche 914-2 26.0 0 4 120.3 91 2.140
## Lotus Europa 30.4 1 4 95.1 113 1.513
## Ford Pantera L 15.8 0 8 351.0 264 3.170
## Ferrari Dino 19.7 0 6 145.0 175 2.770
## Maserati Bora 15.0 0 8 301.0 335 3.570
## Volvo 142E 21.4 1 4 121.0 109 2.780
# Fit a logistic regression (binomial family) of the 0/1 column vs on
# mpg, cyl, disp, hp and wt.
vLogicModel01 <- glm(
  formula = vs ~ mpg + cyl + disp + hp + wt,
  family = binomial,
  data = vTmp01
)
# Alternative two-predictor model (requires `am` to be present in vTmp01):
# vLogicModel01 <- glm(vs ~ mpg + am, data = vTmp01, family = binomial)
# Print the call, coefficients, deviances and AIC of the fit.
vLogicModel01
##
## Call: glm(formula = vs ~ mpg + cyl + disp + hp + wt, family = binomial,
## data = vTmp01)
##
## Coefficients:
## (Intercept) mpg cyl disp hp
## -14.28582 0.60334 -1.55312 -0.02041 -0.05938
## wt
## 7.17654
##
## Degrees of Freedom: 31 Total (i.e. Null); 26 Residual
## Null Deviance: 43.86
## Residual Deviance: 11.56 AIC: 23.56
# Detailed report for the fitted model: deviance residual quantiles, the
# coefficient table (estimate, standard error, z value, p-value), null and
# residual deviance, AIC and the number of Fisher scoring iterations.
summary(vLogicModel01)
##
## Call:
## glm(formula = vs ~ mpg + cyl + disp + hp + wt, family = binomial,
## data = vTmp01)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.34645 -0.08400 -0.00066 0.18483 1.15618
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.28582 25.63753 -0.557 0.577
## mpg 0.60334 0.70383 0.857 0.391
## cyl -1.55312 1.50604 -1.031 0.302
## disp -0.02041 0.02737 -0.746 0.456
## hp -0.05938 0.04668 -1.272 0.203
## wt 7.17654 5.48145 1.309 0.190
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 43.860 on 31 degrees of freedom
## Residual deviance: 11.561 on 26 degrees of freedom
## AIC: 23.561
##
## Number of Fisher Scoring iterations: 8
# Fitted probabilities on the response scale (type = "response") for the same
# rows the model was trained on.
vReturn_01 <- predict(
  object = vLogicModel01,
  newdata = vTmp01,
  type = "response"
)
# Per-observation Bernoulli log-likelihood contributions.
#
# y  : observed outcomes coded 0/1.
# py : predicted probabilities of y == 1.
#
# Returns y * log(py) + (1 - y) * log(1 - py), element by element, with the
# usual recycling of the shorter argument.
llcomponents <- function(y, py) {
  ll_event <- y * log(py)
  ll_nonevent <- (1 - y) * log(1 - py)
  # 0 * log(0) evaluates to NaN (0 * -Inf), although a term with zero weight
  # contributes nothing to the likelihood. This happens whenever a fitted
  # probability is numerically 0 or 1 and agrees with the observed class.
  # which() drops NA positions so missing inputs still propagate as NA.
  ll_event[which(y == 0 & py == 0)] <- 0
  ll_nonevent[which(y == 1 & py == 1)] <- 0
  ll_event + ll_nonevent
}
# Signed deviance residuals computed by hand: the sign is that of
# (observed - fitted) and the magnitude is sqrt(-2 * log-likelihood term).
vReturn_02 <- local({
  observed <- as.numeric(vTmp01$vs)
  direction <- sign(observed - vReturn_01)
  magnitude <- sqrt(-2 * llcomponents(observed, vReturn_01))
  direction * magnitude
})
# Quantiles of the hand-computed residuals, for comparison with the
# "Deviance Residuals" section printed by summary() of the model.
summary(vReturn_02)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.3460000 -0.0840000 -0.0006645 0.0076510 0.1848000 1.1560000
# Place the fitted probabilities, rounded to two decimals, in front of the
# model data so each row's prediction sits next to its observed values.
# NOTE(review): the probability column is passed unnamed, so data.frame()
# derives its name from the expression text — confirm the resulting name
# before referring to that column elsewhere.
vReturn_99<-cbind(data.frame(round(vReturn_01,digits = 2)),vTmp01)
Logistic Regression 모형이 실제로 잘 판별했는지 파악함
# Turn the fitted probabilities into hard labels with a 0.5 cutoff:
# "R" when the probability exceeds 0.5, "M" otherwise.
vReturn_01.class <- ifelse(vReturn_01 > 0.5, "R", "M")
# Confusion matrix: observed vs codes in rows, predicted labels in columns.
table(Class = vTmp01$vs, pred = vReturn_01.class)
## pred
## Class M R
## 0 17 1
## 1 0 14
# Misclassification rate at the 0.5 cutoff.
# The observed vs codes are numeric 0/1, so they are compared with 0/1
# predictions. Comparing them with the "R"/"M" labels in vReturn_01.class
# coerces both sides to character, makes every element differ, and always
# reports an error rate of 1.
mean(vTmp01$vs != as.numeric(vReturn_01 > 0.5))
## [1] 0.03125
AIC (Akaike Information Criterion)
AIC는 작을수록 좋음. 이는 모형의 적합도와 간결성을 동시에 고려하는 지수임.
# Print the model summary again; the AIC value is reported near the bottom,
# below the null and residual deviance.
summary(vLogicModel01)
##
## Call:
## glm(formula = vs ~ mpg + cyl + disp + hp + wt, family = binomial,
## data = vTmp01)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.34645 -0.08400 -0.00066 0.18483 1.15618
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.28582 25.63753 -0.557 0.577
## mpg 0.60334 0.70383 0.857 0.391
## cyl -1.55312 1.50604 -1.031 0.302
## disp -0.02041 0.02737 -0.746 0.456
## hp -0.05938 0.04668 -1.272 0.203
## wt 7.17654 5.48145 1.309 0.190
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 43.860 on 31 degrees of freedom
## Residual deviance: 11.561 on 26 degrees of freedom
## AIC: 23.561
##
## Number of Fisher Scoring iterations: 8
ROC 곡선이 reference line(대각선)인 1번 라인에서 멀어져 2번 라인에 가까워질수록 모델의 정확도는 향상됨 [참조 : http://www.dodomira.com/]
library(Deducer)
## Loading required package: ggplot2
## Loading required package: JGR
## Loading required package: rJava
## Loading required package: JavaGD
## Loading required package: iplots
##
## Please type JGR() to launch console. Platform specific launchers (.exe and .app) can also be obtained at http://www.rforge.net/JGR/files/.
## Loading required package: car
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
##
## Note Non-JGR console detected:
## Deducer is best used from within JGR (http://jgr.markushelbig.org/).
## To Bring up GUI dialogs, type deducer().
# Draw the ROC curve for the fitted logistic model.
# NOTE(review): rocplot() is presumably provided by the Deducer package
# attached just above — confirm.
rocplot(vLogicModel01)
Hosmer와 Lemeshow 검정의 카이제곱 값은 로지스틱 회귀모형의 전체적인 적합도를 나타내는 값이다.
이 값이 작을수록 모형의 적합도는 높다.
#install.packages("ResourceSelection")
library(ResourceSelection)
## ResourceSelection 0.2-6 2016-02-15
# Hosmer-Lemeshow goodness-of-fit test: observed outcomes against fitted
# probabilities, grouped into g = 10 bins.
vCheck <- hoslem.test(
  x = vTmp01$vs,
  y = fitted(vLogicModel01),
  g = 10
)
# Show the observed and expected counts of each bin side by side.
cbind(vCheck[["observed"]], vCheck[["expected"]])
## y0 y1 yhat0 yhat1
## [1.41e-08,2.32e-05] 4 0 3.999996670 3.329535e-06
## (2.32e-05,0.000682] 3 0 2.998902800 1.097200e-03
## (0.000682,0.00373] 3 0 2.993596439 6.403561e-03
## (0.00373,0.0216] 3 0 2.964838208 3.516179e-02
## (0.0216,0.301] 3 0 2.741358277 2.586417e-01
## (0.301,0.706] 1 2 1.371064633 1.628935e+00
## (0.706,0.875] 0 3 0.586695966 2.413304e+00
## (0.875,0.962] 1 2 0.289567335 2.710433e+00
## (0.962,0.999] 0 3 0.052812776 2.947187e+00
## (0.999,1] 0 4 0.001166893 3.998833e+00
데이터 예제로 도움이 되었으면 합니다.
모두가 R을 잘 사용하는 그날까지..
Thanks & best regards