다항 로지스틱 회귀분석


패키지 불러오기
# echo=TRUE : 코드를 평가하고 실행결과를 포함한다
# eval=TRUE : 실행결과와 함게 코드를 출력한다
# message=FALSE : 메시지를 출력한다
# warning=TRUE : 경고메시지를 출력한다 
# error=FALSE : 오류메시지를 출력한다 
# tidy=FALSE : 깔끔한 방식으로 코드 형태를 변형한다 

library(readxl)
library(corrplot)
library(mlogit)
library(ggplot2)

데이터 불러오기
setwd("C:/Users/user/Desktop/krivet")
edu <- read_excel("data_merge_3_na.xlsx")

# 종속변수= satisfaction55, 설명변수 21개
str(edu)
## Classes 'tbl_df', 'tbl' and 'data.frame':    312 obs. of  22 variables:
##  $ satisfaction55: num  4 4 4 5 4 4 4 4 2 3 ...
##  $ facilities79_1: num  4 4 4 4 3 1 4 3 1 3 ...
##  $ facilities80_2: num  4 4 4 2 3 2 3 4 1 3 ...
##  $ facilities81_3: num  4 4 5 3 3 3 3 4 1 3 ...
##  $ facilities82_4: num  4 4 5 3 3 2 3 4 1 3 ...
##  $ ralation84_1  : num  4 4 4 3 3 1 3 4 1 3 ...
##  $ pride85_1     : num  4 4 4 3 4 3 4 4 1 3 ...
##  $ pride86_2     : num  4 4 5 3 4 3 4 4 1 3 ...
##  $ schoollife87_1: num  4 4 4 3 4 4 2 4 1 3 ...
##  $ schoollife88_2: num  4 4 4 3 3 1 4 4 1 3 ...
##  $ schoollife89_3: num  3 4 4 3 3 2 4 4 1 3 ...
##  $ schoollife90_4: num  4 4 3 3 4 3 4 4 1 3 ...
##  $ schoollife91_5: num  4 4 4 3 4 4 4 4 1 3 ...
##  $ schoollife92_6: num  4 4 4 3 4 4 4 4 3 4 ...
##  $ schoollife93_7: num  4 4 4 1 3 3 3 4 1 3 ...
##  $ class94_1     : num  4 4 4 3 4 4 4 4 3 3 ...
##  $ class95_2     : num  4 4 4 3 4 4 4 4 2 3 ...
##  $ class96_3     : num  4 4 4 3 4 3 4 4 2 3 ...
##  $ class97_4     : num  4 4 4 3 4 3 4 4 2 4 ...
##  $ class98_5     : num  4 4 4 3 4 5 4 4 3 4 ...
##  $ class99_6     : num  4 4 4 3 4 5 3 4 3 4 ...
##  $ prof102_1     : num  1 1 2 2 1 1 1 1 1 1 ...
summary(edu)
##  satisfaction55  facilities79_1  facilities80_2  facilities81_3 
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :4.000   Median :4.000   Median :4.000   Median :4.000  
##  Mean   :3.872   Mean   :3.583   Mean   :3.506   Mean   :3.609  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  facilities82_4   ralation84_1     pride85_1       pride86_2    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :4.000   Median :4.000   Median :3.000   Median :4.000  
##  Mean   :3.622   Mean   :3.712   Mean   :3.465   Mean   :3.679  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  schoollife87_1  schoollife88_2  schoollife89_3  schoollife90_4 
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :3.000   Median :3.000   Median :3.000   Median :4.000  
##  Mean   :3.359   Mean   :3.442   Mean   :3.279   Mean   :3.583  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##  schoollife91_5  schoollife92_6  schoollife93_7    class94_1    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:3.000  
##  Median :3.000   Median :4.000   Median :3.000   Median :4.000  
##  Mean   :3.388   Mean   :3.619   Mean   :2.955   Mean   :3.766  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##    class95_2       class96_3       class97_4       class98_5    
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
##  Median :4.000   Median :4.000   Median :4.000   Median :4.000  
##  Mean   :3.756   Mean   :3.846   Mean   :3.798   Mean   :3.715  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##    class99_6       prof102_1    
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:1.000  
##  Median :4.000   Median :2.000  
##  Mean   :3.827   Mean   :1.721  
##  3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :5.000   Max.   :5.000

상관계수
# 분석에 사용된 변수만 선택(14개)
edu[ ,-c(2,6,12,13,14,15,22)]
## # A tibble: 312 x 15
##    satisfaction55 facilities80_2 facilities81_3 facilities82_4 pride85_1
##             <dbl>          <dbl>          <dbl>          <dbl>     <dbl>
##  1              4              4              4              4         4
##  2              4              4              4              4         4
##  3              4              4              5              5         4
##  4              5              2              3              3         3
##  5              4              3              3              3         4
##  6              4              2              3              2         3
##  7              4              3              3              3         4
##  8              4              4              4              4         4
##  9              2              1              1              1         1
## 10              3              3              3              3         3
## # ... with 302 more rows, and 10 more variables: pride86_2 <dbl>,
## #   schoollife87_1 <dbl>, schoollife88_2 <dbl>, schoollife89_3 <dbl>,
## #   class94_1 <dbl>, class95_2 <dbl>, class96_3 <dbl>, class97_4 <dbl>,
## #   class98_5 <dbl>, class99_6 <dbl>
correl <- cor(edu[ ,-c(2,6,12,13,14,15,22)])  

# "circle", "square", "ellipse", "number", "shade","color", "pie"
corrplot(correl, method = c("shade")) 

# 상관계수 + 컬러
corrplot.mixed(correl)

cor.test(edu$satisfaction55, edu$class96_3)
## 
##  Pearson's product-moment correlation
## 
## data:  edu$satisfaction55 and edu$class96_3
## t = 7.8883, df = 310, p-value = 5.293e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3119928 0.4973270
## sample estimates:
##      cor 
## 0.408867

데이터 형태 변환 및 분석
# factor the dv(종속변수 범주형으로 변환) 
edu$satisfaction55 <- factor(edu$satisfaction55, level=c(1:5), label=c("sobad","bad","normal","good","sogood"))

# restructure(데이터 형태 변환)
longdata <- mlogit.data(edu, choice = "satisfaction55", shape="wide")

# run 분석(설명변수 14개 선택)
model <- mlogit(satisfaction55 ~ 1 | facilities80_2+facilities81_3+facilities82_4+
                  pride85_1+pride86_2+schoollife87_1+schoollife88_2+schoollife89_3+
                  class94_1+class95_2+class96_3+class97_4+class98_5+class99_6,
                data=longdata, reflevel="bad")       #reflevel 는 base 설정하기 위한 것
summary(model)
## 
## Call:
## mlogit(formula = satisfaction55 ~ 1 | facilities80_2 + facilities81_3 + 
##     facilities82_4 + pride85_1 + pride86_2 + schoollife87_1 + 
##     schoollife88_2 + schoollife89_3 + class94_1 + class95_2 + 
##     class96_3 + class97_4 + class98_5 + class99_6, data = longdata, 
##     reflevel = "bad", method = "nr", print.level = 0)
## 
## Frequencies of alternatives:
##       bad      good    normal     sobad    sogood 
## 0.0352564 0.5608974 0.2243590 0.0032051 0.1762821 
## 
## nr method
## 9 iterations, 0h:0m:0s 
## g'(-H)^-1g = 2.67E-08 
## gradient close to zero 
## 
## Coefficients :
##                          Estimate  Std. Error z-value  Pr(>|z|)    
## good:(intercept)      -7.8699e+00  2.7281e+00 -2.8847  0.003917 ** 
## normal:(intercept)    -2.4103e+00  2.6440e+00 -0.9116  0.361977    
## sobad:(intercept)     -5.3357e-01  8.0147e+00 -0.0666  0.946920    
## sogood:(intercept)    -1.6591e+01  3.1105e+00 -5.3340 9.609e-08 ***
## good:facilities80_2    3.0305e-01  6.8533e-01  0.4422  0.658344    
## normal:facilities80_2 -1.4745e-01  6.9335e-01 -0.2127  0.831585    
## sobad:facilities80_2   2.9592e-01  2.1783e+00  0.1359  0.891940    
## sogood:facilities80_2  2.3565e-01  7.1467e-01  0.3297  0.741600    
## good:facilities81_3   -1.1685e+00  7.0196e-01 -1.6646  0.095983 .  
## normal:facilities81_3 -1.2980e+00  7.1710e-01 -1.8101  0.070275 .  
## sobad:facilities81_3  -1.8614e+00  2.3063e+00 -0.8071  0.419594    
## sogood:facilities81_3 -1.0584e+00  7.4185e-01 -1.4267  0.153654    
## good:facilities82_4    1.2637e+00  6.1596e-01  2.0516  0.040204 *  
## normal:facilities82_4  1.5309e+00  6.3070e-01  2.4274  0.015209 *  
## sobad:facilities82_4   9.4720e-01  1.9749e+00  0.4796  0.631494    
## sogood:facilities82_4  1.0745e+00  6.4398e-01  1.6686  0.095203 .  
## good:pride85_1         3.3963e-02  9.5730e-01  0.0355  0.971699    
## normal:pride85_1       2.7938e-01  9.7360e-01  0.2870  0.774146    
## sobad:pride85_1        6.4888e-01  3.2331e+00  0.2007  0.840933    
## sogood:pride85_1       4.6274e-03  9.8265e-01  0.0047  0.996243    
## good:pride86_2         1.9475e+00  8.7676e-01  2.2213  0.026333 *  
## normal:pride86_2       1.2242e+00  8.9091e-01  1.3741  0.169398    
## sobad:pride86_2        5.5270e-01  2.9112e+00  0.1899  0.849423    
## sogood:pride86_2       2.5139e+00  9.2104e-01  2.7294  0.006345 ** 
## good:schoollife87_1   -4.2881e-01  7.8327e-01 -0.5475  0.584060    
## normal:schoollife87_1 -2.9809e-01  8.0548e-01 -0.3701  0.711326    
## sobad:schoollife87_1  -1.2389e-02  2.4764e+00 -0.0050  0.996008    
## sogood:schoollife87_1 -4.1482e-01  8.2230e-01 -0.5045  0.613937    
## good:schoollife88_2   -2.3139e-01  7.9946e-01 -0.2894  0.772250    
## normal:schoollife88_2 -5.5247e-01  8.1345e-01 -0.6792  0.497028    
## sobad:schoollife88_2  -4.5565e-01  2.6493e+00 -0.1720  0.863447    
## sogood:schoollife88_2  1.9863e-02  8.5194e-01  0.0233  0.981399    
## good:schoollife89_3   -1.1345e+00  7.3996e-01 -1.5333  0.125214    
## normal:schoollife89_3 -1.0162e+00  7.6255e-01 -1.3326  0.182658    
## sobad:schoollife89_3   1.6310e-01  2.6384e+00  0.0618  0.950706    
## sogood:schoollife89_3 -9.2099e-01  7.7462e-01 -1.1890  0.234455    
## good:class94_1        -8.5247e-01  1.0845e+00 -0.7860  0.431852    
## normal:class94_1      -1.6647e+00  1.1024e+00 -1.5101  0.131011    
## sobad:class94_1       -9.2264e-01  3.4115e+00 -0.2705  0.786814    
## sogood:class94_1      -4.9406e-01  1.1616e+00 -0.4253  0.670604    
## good:class95_2         1.5336e-01  9.5857e-01  0.1600  0.872888    
## normal:class95_2       2.8062e-04  9.7726e-01  0.0003  0.999771    
## sobad:class95_2       -2.9243e-01  2.9761e+00 -0.0983  0.921726    
## sogood:class95_2       2.6085e-01  1.0316e+00  0.2529  0.800382    
## good:class96_3         2.4058e+00  1.0123e+00  2.3764  0.017482 *  
## normal:class96_3       2.7799e+00  1.0295e+00  2.7003  0.006928 ** 
## sobad:class96_3        1.0064e+00  3.1052e+00  0.3241  0.745862    
## sogood:class96_3       2.6673e+00  1.0777e+00  2.4750  0.013323 *  
## good:class97_4        -2.2359e-01  8.5974e-01 -0.2601  0.794808    
## normal:class97_4       4.5506e-01  8.8948e-01  0.5116  0.608928    
## sobad:class97_4        1.1019e-01  2.8478e+00  0.0387  0.969136    
## sogood:class97_4      -4.1411e-02  9.3180e-01 -0.0444  0.964552    
## good:class98_5         1.4481e-01  1.0519e+00  0.1377  0.890502    
## normal:class98_5      -6.4164e-01  1.0771e+00 -0.5957  0.551359    
## sobad:class98_5       -3.6633e-01  3.7289e+00 -0.0982  0.921741    
## sogood:class98_5      -2.3716e-01  1.0913e+00 -0.2173  0.827962    
## good:class99_6         9.9312e-01  1.0650e+00  0.9325  0.351079    
## normal:class99_6       7.2476e-01  1.0870e+00  0.6667  0.504945    
## sobad:class99_6       -2.3388e-01  3.5453e+00 -0.0660  0.947402    
## sogood:class99_6       1.4908e+00  1.1160e+00  1.3358  0.181603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -272.94
## McFadden R^2:  0.20611 
## Likelihood ratio test : chisq = 141.73 (p.value = 2.2467e-09)

분석 결과
# look at correct classification
correct <- model$probabilities
binarycorrect <- colnames(correct)[apply(correct,1,which.max)]
table(binarycorrect)
## binarycorrect
##    bad   good normal sogood 
##      3    246     32     31
table(edu$satisfaction55, binarycorrect)
##         binarycorrect
##          bad good normal sogood
##   sobad    0    1      0      0
##   bad      2    7      2      0
##   normal   1   52     17      0
##   good     0  159     10      6
##   sogood   0   27      3     25
# percent (정분류율)
(2+17+159+25)/nrow(edu)*100
## [1] 65.0641

시각화
# ggplot

cleanup <- theme(panel.grid.major = element_blank(),
                 panel.grid.minor = element_blank(),
                 panel.background = element_blank(),
                 axis.text.y = element_blank(),
                 axis.ticks = element_blank(),
                 axis.line.x = element_line(color = "black"),
                 axis.line.y = element_line(color = "black"),
                 text = element_text(size=20),
                 legend.key = element_blank())
correct <- as.data.frame(correct)
# correct$good, correct$bad 등 범주 교체 가능  
# bad 중심으로 분포 확인 
hist <- ggplot(edu, aes(correct$bad, color=satisfaction55, fill=satisfaction55))
hist + cleanup +
  geom_dotplot(binwidth=.01, position="jitter") +
  coord_cartesian(xlim=c(0,1)) +
  xlab("Likelihood of bad") +
  ylab("Frequency") +
  scale_color_manual(values=c("Blue","Green","Red","yellow", "black"),
                     labels=c("sobad","bad", "normal","good","sogood"),
                     name="Diagnosis") +
  scale_fill_manual(values=c("Blue","Green","Red","yellow", "black"),
                    labels=c("sobad","bad", "normal","good","sogood"),
                    name="Diagnosis")