Logistic Regression
library(Epi) # ROC 패키지
autoparts <- read.csv("C:/Users/user/Desktop/JBTP/autoparts.csv", header=T)
dim(autoparts)
## [1] 34139 17
str(autoparts)
## 'data.frame': 34139 obs. of 17 variables:
## $ prod_date : Factor w/ 34139 levels "2014-03-01 오전 10:00:33",..: 34139 34138 34137 34136 34135 34134 34133 34132 34131 34130 ...
## $ prod_no : Factor w/ 6 levels "45231-3B400",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ prod_name : Factor w/ 1 level "Oil Gasket": 1 1 1 1 1 1 1 1 1 1 ...
## $ degree : int 2 2 2 2 2 2 2 2 2 2 ...
## $ mold : Factor w/ 3 levels "생산대기","승인대기",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ prod : Factor w/ 1 level "생산": 1 1 1 1 1 1 1 1 1 1 ...
## $ s_no : int 892890 892889 892888 892887 892886 892885 892884 892883 892882 892881 ...
## $ fix_time : num 85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
## $ a_speed : num 0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
## $ b_speed : num 1.72 1.71 1.72 1.72 1.7 ...
## $ separation : num 242 245 243 242 242 ...
## $ s_separation : num 658 657 658 657 657 ...
## $ rate_terms : int 95 95 95 95 95 95 95 95 95 95 ...
## $ mpa : num 78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
## $ load_time : num 18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
## $ highpressure_time: int 58 58 82 74 56 78 55 57 50 60 ...
## $ c_thickness : num 24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...
autoparts1 <- autoparts[autoparts$prod_no=="90784-76001", -c(1:7)]
dim(autoparts1)
## [1] 21779 10
# 데이터 탐색 및 이상치 제거
boxplot(autoparts1$c_thickness)
autoparts2 <- autoparts1[autoparts1$c_thickness < 1000, ]
dim(autoparts2)
## [1] 21767 10
str(autoparts2)
## 'data.frame': 21767 obs. of 10 variables:
## $ fix_time : num 85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
## $ a_speed : num 0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
## $ b_speed : num 1.72 1.71 1.72 1.72 1.7 ...
## $ separation : num 242 245 243 242 242 ...
## $ s_separation : num 658 657 658 657 657 ...
## $ rate_terms : int 95 95 95 95 95 95 95 95 95 95 ...
## $ mpa : num 78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
## $ load_time : num 18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
## $ highpressure_time: int 58 58 82 74 56 78 55 57 50 60 ...
## $ c_thickness : num 24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...
# 종속변수를 범주형으로 나눔(로짓분석을 위해)
autoparts2$y_faulty <- ifelse((autoparts2$c_thickness<20)|(autoparts2$c_thickness>32),1,0)
table(autoparts2$y_faulty)
##
## 0 1
## 18925 2842
# train, test data 나누기
t_index <- sample(1:nrow(autoparts2), size=nrow(autoparts2)*0.7)
train <- autoparts2[t_index, ]
test <- autoparts2[-t_index, ]
nrow(autoparts2)
## [1] 21767
nrow(train); nrow(test)
## [1] 15236
## [1] 6531
# train 모델 적재
m <- glm(factor(y_faulty) ~ fix_time+a_speed+b_speed+separation+s_separation+
rate_terms+mpa+load_time+highpressure_time, data=train, family = binomial(logit))
summary(m)
##
## Call:
## glm(formula = factor(y_faulty) ~ fix_time + a_speed + b_speed +
## separation + s_separation + rate_terms + mpa + load_time +
## highpressure_time, family = binomial(logit), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.7566 -0.3740 -0.2147 -0.1191 5.1628
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.513e+02 1.192e+01 -37.857 < 2e-16 ***
## fix_time -1.858e-02 1.146e-02 -1.621 0.10504
## a_speed 2.025e+01 1.133e+00 17.868 < 2e-16 ***
## b_speed -2.234e+00 4.865e-01 -4.593 4.37e-06 ***
## separation 5.315e-01 1.337e-02 39.761 < 2e-16 ***
## s_separation 4.962e-01 1.305e-02 38.020 < 2e-16 ***
## rate_terms -2.305e-02 8.206e-03 -2.809 0.00497 **
## mpa -1.429e-01 4.024e-03 -35.518 < 2e-16 ***
## load_time -3.240e-02 1.747e-02 -1.855 0.06356 .
## highpressure_time 2.199e-04 1.802e-05 12.205 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 11885.0 on 15235 degrees of freedom
## Residual deviance: 7022.8 on 15226 degrees of freedom
## AIC: 7042.8
##
## Number of Fisher Scoring iterations: 6
# 예측값 문턱값 0.5로 설정
# 0.5 이상이면 1, 그외 0
head(m$fitted.values)
## 8405 15347 4517 2832 23428 8452
## 0.006718045 0.027264075 0.079713643 0.278308675 0.024400265 0.182423409
yhat <- ifelse(m$fitted.values>=0.5, 1, 0)
# 결과 빈도 보기
table(real=train$y_faulty, pred=yhat)
## pred
## real 0 1
## 0 12997 229
## 1 1112 898
# test 값 예측하기
yhat_test <- predict(m,test)
summary(yhat_test)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11.111 -4.252 -3.233 -2.852 -1.865 14.304
head(yhat_test)
## 9 14 24 29 31 33
## -2.1370000 0.4042014 0.9878204 -1.0262440 -3.1577508 -1.0381084
ROC(Receiver Operating Characteristic)
ROC(test=yhat_test, stat=test$y_faulty, plot="ROC", AUC=T, main="logistic regression")
***