다항 로지스틱 회귀분석




패키지 불러오기
library(nnet)   # 다항 로지스틱 회귀모형 분석 패키지

파일 불러오기
autoparts <- read.csv("C:/Users/user/Desktop/JBTP/autoparts.csv", header=T)
dim(autoparts)
## [1] 34139    17
str(autoparts)
## 'data.frame':    34139 obs. of  17 variables:
##  $ prod_date        : Factor w/ 34139 levels "2014-03-01 오전 10:00:33",..: 34139 34138 34137 34136 34135 34134 34133 34132 34131 34130 ...
##  $ prod_no          : Factor w/ 6 levels "45231-3B400",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ prod_name        : Factor w/ 1 level "Oil Gasket": 1 1 1 1 1 1 1 1 1 1 ...
##  $ degree           : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ mold             : Factor w/ 3 levels "생산대기","승인대기",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ prod             : Factor w/ 1 level "생산": 1 1 1 1 1 1 1 1 1 1 ...
##  $ s_no             : int  892890 892889 892888 892887 892886 892885 892884 892883 892882 892881 ...
##  $ fix_time         : num  85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
##  $ a_speed          : num  0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
##  $ b_speed          : num  1.72 1.71 1.72 1.72 1.7 ...
##  $ separation       : num  242 245 243 242 242 ...
##  $ s_separation     : num  658 657 658 657 657 ...
##  $ rate_terms       : int  95 95 95 95 95 95 95 95 95 95 ...
##  $ mpa              : num  78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
##  $ load_time        : num  18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
##  $ highpressure_time: int  58 58 82 74 56 78 55 57 50 60 ...
##  $ c_thickness      : num  24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...

파일 전처리
autoparts1 <- autoparts[autoparts$prod_no=="90784-76001", -c(1:7)]
dim(autoparts1)
## [1] 21779    10
# 데이터 탐색 및 이상치 제거
boxplot(autoparts1$c_thickness)

autoparts2 <- autoparts1[autoparts1$c_thickness < 1000, ]
dim(autoparts2)
## [1] 21767    10
str(autoparts2)
## 'data.frame':    21767 obs. of  10 variables:
##  $ fix_time         : num  85.5 86.2 86 86.1 86.1 86.3 86.5 86.4 86.3 86 ...
##  $ a_speed          : num  0.611 0.606 0.609 0.61 0.603 0.606 0.606 0.607 0.604 0.608 ...
##  $ b_speed          : num  1.72 1.71 1.72 1.72 1.7 ...
##  $ separation       : num  242 245 243 242 242 ...
##  $ s_separation     : num  658 657 658 657 657 ...
##  $ rate_terms       : int  95 95 95 95 95 95 95 95 95 95 ...
##  $ mpa              : num  78.2 77.9 78 78.2 77.9 77.9 78.2 77.5 77.8 77.5 ...
##  $ load_time        : num  18.1 18.2 18.1 18.1 18.2 18 18.1 18.1 18 18.1 ...
##  $ highpressure_time: int  58 58 82 74 56 78 55 57 50 60 ...
##  $ c_thickness      : num  24.7 22.5 24.1 25.1 24.5 22.9 24.3 23.9 22.2 19 ...
#종속변수 3개 범주로 나누기 
autoparts2$g_class <- as.factor(ifelse(autoparts2$c_thickness<20, 1, 
                                       ifelse(autoparts2$c_thickness<32, 2,3)))
table(autoparts2$g_class)
## 
##     1     2     3 
##  2141 18914   712
# train, test data 나누기
t_index <- sample(1:nrow(autoparts2), size=nrow(autoparts2)*0.7)
train <- autoparts2[t_index, ]
test <- autoparts2[-t_index, ]

nrow(autoparts2)
## [1] 21767
nrow(train); nrow(test)
## [1] 15236
## [1] 6531

다항 로짓회귀 분석
# train 모델 적재 분석
m <- multinom(g_class ~ fix_time+a_speed+b_speed+separation+s_separation+
                rate_terms+mpa+load_time+highpressure_time, data=train)
## # weights:  33 (20 variable)
## initial  value 16738.456830 
## iter  10 value 5090.149755
## iter  20 value 4823.926106
## iter  30 value 3911.105599
## iter  40 value 3116.363209
## iter  50 value 3030.297335
## iter  60 value 3002.166727
## iter  70 value 2540.157820
## iter  80 value 2041.578212
## iter  90 value 1873.288103
## iter 100 value 1866.447015
## final  value 1866.447015 
## stopped after 100 iterations
# 결과는 두개만 나타남(분류 2와 분류 3에 대한 y절편과 각 변수에 대한 계수 정보)
# 분류 1에 대한 정보는 따로 주지 않고 분류 2 또는 분류 3에 속하지 않는 모든 것을 분류 1로 간주
summary(m)
## Call:
## multinom(formula = g_class ~ fix_time + a_speed + b_speed + separation + 
##     s_separation + rate_terms + mpa + load_time + highpressure_time, 
##     data = train)
## 
## Coefficients:
##   (Intercept)  fix_time   a_speed  b_speed separation s_separation
## 2    1998.970 0.1266347 -13.18778 4.274698  -2.143214    -2.168558
## 3    2089.806 0.1263131 -24.02973 4.490718  -2.225249    -2.259454
##   rate_terms        mpa  load_time highpressure_time
## 2 0.11214178 -0.7843368 -0.1546241      0.0003102251
## 3 0.09587381 -0.9098402 -0.1771477      0.0003408614
## 
## Std. Errors:
##    (Intercept)   fix_time      a_speed      b_speed  separation
## 2 1.205688e-05 0.02005001 0.0008699763 1.325469e-04 0.003503471
## 3 1.275855e-05 0.02265064 0.0001787082 4.335144e-05 0.004180400
##   s_separation rate_terms        mpa  load_time highpressure_time
## 2  0.002644795 0.01523946 0.01581643 0.06149071      9.069647e-05
## 3  0.003265422 0.02312659 0.01674634 0.06332719      9.564052e-05
## 
## Residual Deviance: 3732.894 
## AIC: 3772.894

test data 예측
# test 값 예측하기
yhat_test <- predict(m, test)
# accuray 체크
table <- table(real=test$g_class, predict=yhat_test)
table
##     predict
## real    1    2    3
##    1  502  130    2
##    2   80 5517   62
##    3    2   60  176