로지스틱 회귀분석과 성능평가(ROC)


Logistic Regression


패키지 불러오기
library(Epi)   # ROC 패키지

파일 불러오기
autoparts <- read.csv("C:/Users/user/Desktop/JBTP/autoparts.csv", header=T)
dim(autoparts)
## [1] 34139    17
str(autoparts)
## 'data.frame':    34139 obs. of  17 variables:
##  $ prod_date        : Factor w/ 34139 levels "2014-03-01 오전 10:00:33",..: 34139 34138 34137 34136 34135 34134 34133 34132 34131 34130 ...
##  $ prod_no          : Factor w/ 6 levels "45231-3B400",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ prod_name        : Factor w/ 1 level "Oil Gasket": 1 1 1 1 1 1 1 1 1 1 ...
##  $ degree           : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ mold             : Factor w/ 3 levels "생산대기","승인대기",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ prod             : Factor w/ 1 level "생산": 1 1 1 1 1 1 1 1 1 1 ...
##  $ s_no             : int  892890 892889 892888 892887 892886 892885 892884 892883 892882 892881 ...
##  $ fix_time         : num  85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
##  $ a_speed          : num  0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
##  $ b_speed          : num  1.72 1.71 1.72 1.72 1.7 ...
##  $ separation       : num  242 245 243 242 242 ...
##  $ s_separation     : num  658 657 658 657 657 ...
##  $ rate_terms       : int  95 95 95 95 95 95 95 95 95 95 ...
##  $ mpa              : num  78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
##  $ load_time        : num  18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
##  $ highpressure_time: int  58 58 82 74 56 78 55 57 50 60 ...
##  $ c_thickness      : num  24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...

파일 전처리
autoparts1 <- autoparts[autoparts$prod_no=="90784-76001", -c(1:7)]
dim(autoparts1)
## [1] 21779    10
# 데이터 탐색 및 이상치 제거
boxplot(autoparts1$c_thickness)

autoparts2 <- autoparts1[autoparts1$c_thickness < 1000, ]
dim(autoparts2)
## [1] 21767    10
str(autoparts2)
## 'data.frame':    21767 obs. of  10 variables:
##  $ fix_time         : num  85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
##  $ a_speed          : num  0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
##  $ b_speed          : num  1.72 1.71 1.72 1.72 1.7 ...
##  $ separation       : num  242 245 243 242 242 ...
##  $ s_separation     : num  658 657 658 657 657 ...
##  $ rate_terms       : int  95 95 95 95 95 95 95 95 95 95 ...
##  $ mpa              : num  78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
##  $ load_time        : num  18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
##  $ highpressure_time: int  58 58 82 74 56 78 55 57 50 60 ...
##  $ c_thickness      : num  24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...
# 종속변수를 범주형으로 나눔(로짓분석을 위해)
autoparts2$y_faulty <- ifelse((autoparts2$c_thickness<20)|(autoparts2$c_thickness>32),1,0)
table(autoparts2$y_faulty)
## 
##     0     1 
## 18925  2842
# train, test data 나누기
t_index <- sample(1:nrow(autoparts2), size=nrow(autoparts2)*0.7)
train <- autoparts2[t_index, ]
test <- autoparts2[-t_index, ]

nrow(autoparts2)
## [1] 21767
nrow(train); nrow(test)
## [1] 15236
## [1] 6531

분석(로짓회귀)
# train 모델 적재
m <- glm(factor(y_faulty) ~ fix_time+a_speed+b_speed+separation+s_separation+
           rate_terms+mpa+load_time+highpressure_time, data=train, family = binomial(logit))

summary(m)
## 
## Call:
## glm(formula = factor(y_faulty) ~ fix_time + a_speed + b_speed + 
##     separation + s_separation + rate_terms + mpa + load_time + 
##     highpressure_time, family = binomial(logit), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.7566  -0.3740  -0.2147  -0.1191   5.1628  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -4.513e+02  1.192e+01 -37.857  < 2e-16 ***
## fix_time          -1.858e-02  1.146e-02  -1.621  0.10504    
## a_speed            2.025e+01  1.133e+00  17.868  < 2e-16 ***
## b_speed           -2.234e+00  4.865e-01  -4.593 4.37e-06 ***
## separation         5.315e-01  1.337e-02  39.761  < 2e-16 ***
## s_separation       4.962e-01  1.305e-02  38.020  < 2e-16 ***
## rate_terms        -2.305e-02  8.206e-03  -2.809  0.00497 ** 
## mpa               -1.429e-01  4.024e-03 -35.518  < 2e-16 ***
## load_time         -3.240e-02  1.747e-02  -1.855  0.06356 .  
## highpressure_time  2.199e-04  1.802e-05  12.205  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 11885.0  on 15235  degrees of freedom
## Residual deviance:  7022.8  on 15226  degrees of freedom
## AIC: 7042.8
## 
## Number of Fisher Scoring iterations: 6
# 예측값 문턱값 0.5로 설정
# 0.5 이상이면 1, 그외 0
head(m$fitted.values)
##        8405       15347        4517        2832       23428        8452 
## 0.006718045 0.027264075 0.079713643 0.278308675 0.024400265 0.182423409
yhat <- ifelse(m$fitted.values>=0.5, 1, 0)

# 결과 빈도 보기
table(real=train$y_faulty, pred=yhat)
##     pred
## real     0     1
##    0 12997   229
##    1  1112   898

test data 예측
# test 값 예측하기
yhat_test <- predict(m,test)
summary(yhat_test)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -11.111  -4.252  -3.233  -2.852  -1.865  14.304
head(yhat_test)
##          9         14         24         29         31         33 
## -2.1370000  0.4042014  0.9878204 -1.0262440 -3.1577508 -1.0381084

모델 성능 측정방법

ROC(Receiver Operating Characteristic)

  • 이산형(범주형) 종속변수 데이터의 경우 confusion matrix 결과를 ROC 커브를 통해 성능이 어떤지 평가함
  • 확률값(여기에서는 문턱값) 중 어느 정도가 되었을때 yes로 볼 것인지를 결정해야 한다. 높은 기준값(문턱값)에서도 TP가 높다면 좋은 모델이다. 문턱값이 높은데도 불고하고 많은 행이 실제로 True고, 모델도 true로 예측했기 때문이다
  • AUC (Area under the curve) ROC 커브의 아래 면적을 뜻하며 1일때 이상적인 모델이다

ROC 실행
ROC(test=yhat_test, stat=test$y_faulty, plot="ROC", AUC=T, main="logistic regression")

***