다항 로지스틱 회귀분석
패키지 불러오기
# echo=TRUE : 코드를 평가하고 실행결과를 포함한다
# eval=TRUE : 실행결과와 함게 코드를 출력한다
# message=FALSE : 메시지를 출력한다
# warning=TRUE : 경고메시지를 출력한다
# error=FALSE : 오류메시지를 출력한다
# tidy=FALSE : 깔끔한 방식으로 코드 형태를 변형한다
library(readxl)
library(corrplot)
library(mlogit)
library(ggplot2)
데이터 불러오기
setwd("C:/Users/user/Desktop/krivet")
edu <- read_excel("data_merge_3_na.xlsx")
# 종속변수= satisfaction55, 설명변수 21개
str(edu)
## Classes 'tbl_df', 'tbl' and 'data.frame': 312 obs. of 22 variables:
## $ satisfaction55: num 4 4 4 5 4 4 4 4 2 3 ...
## $ facilities79_1: num 4 4 4 4 3 1 4 3 1 3 ...
## $ facilities80_2: num 4 4 4 2 3 2 3 4 1 3 ...
## $ facilities81_3: num 4 4 5 3 3 3 3 4 1 3 ...
## $ facilities82_4: num 4 4 5 3 3 2 3 4 1 3 ...
## $ ralation84_1 : num 4 4 4 3 3 1 3 4 1 3 ...
## $ pride85_1 : num 4 4 4 3 4 3 4 4 1 3 ...
## $ pride86_2 : num 4 4 5 3 4 3 4 4 1 3 ...
## $ schoollife87_1: num 4 4 4 3 4 4 2 4 1 3 ...
## $ schoollife88_2: num 4 4 4 3 3 1 4 4 1 3 ...
## $ schoollife89_3: num 3 4 4 3 3 2 4 4 1 3 ...
## $ schoollife90_4: num 4 4 3 3 4 3 4 4 1 3 ...
## $ schoollife91_5: num 4 4 4 3 4 4 4 4 1 3 ...
## $ schoollife92_6: num 4 4 4 3 4 4 4 4 3 4 ...
## $ schoollife93_7: num 4 4 4 1 3 3 3 4 1 3 ...
## $ class94_1 : num 4 4 4 3 4 4 4 4 3 3 ...
## $ class95_2 : num 4 4 4 3 4 4 4 4 2 3 ...
## $ class96_3 : num 4 4 4 3 4 3 4 4 2 3 ...
## $ class97_4 : num 4 4 4 3 4 3 4 4 2 4 ...
## $ class98_5 : num 4 4 4 3 4 5 4 4 3 4 ...
## $ class99_6 : num 4 4 4 3 4 5 3 4 3 4 ...
## $ prof102_1 : num 1 1 2 2 1 1 1 1 1 1 ...
summary(edu)
## satisfaction55 facilities79_1 facilities80_2 facilities81_3
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.872 Mean :3.583 Mean :3.506 Mean :3.609
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## facilities82_4 ralation84_1 pride85_1 pride86_2
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :3.000 Median :4.000
## Mean :3.622 Mean :3.712 Mean :3.465 Mean :3.679
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## schoollife87_1 schoollife88_2 schoollife89_3 schoollife90_4
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :3.000 Median :3.000 Median :3.000 Median :4.000
## Mean :3.359 Mean :3.442 Mean :3.279 Mean :3.583
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## schoollife91_5 schoollife92_6 schoollife93_7 class94_1
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:3.000
## Median :3.000 Median :4.000 Median :3.000 Median :4.000
## Mean :3.388 Mean :3.619 Mean :2.955 Mean :3.766
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## class95_2 class96_3 class97_4 class98_5
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000
## Median :4.000 Median :4.000 Median :4.000 Median :4.000
## Mean :3.756 Mean :3.846 Mean :3.798 Mean :3.715
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## class99_6 prof102_1
## Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:1.000
## Median :4.000 Median :2.000
## Mean :3.827 Mean :1.721
## 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :5.000 Max. :5.000
상관계수
# 분석에 사용된 변수만 선택(14개)
edu[ ,-c(2,6,12,13,14,15,22)]
## # A tibble: 312 x 15
## satisfaction55 facilities80_2 facilities81_3 facilities82_4 pride85_1
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 4 4 4 4
## 2 4 4 4 4 4
## 3 4 4 5 5 4
## 4 5 2 3 3 3
## 5 4 3 3 3 4
## 6 4 2 3 2 3
## 7 4 3 3 3 4
## 8 4 4 4 4 4
## 9 2 1 1 1 1
## 10 3 3 3 3 3
## # ... with 302 more rows, and 10 more variables: pride86_2 <dbl>,
## # schoollife87_1 <dbl>, schoollife88_2 <dbl>, schoollife89_3 <dbl>,
## # class94_1 <dbl>, class95_2 <dbl>, class96_3 <dbl>, class97_4 <dbl>,
## # class98_5 <dbl>, class99_6 <dbl>
correl <- cor(edu[ ,-c(2,6,12,13,14,15,22)])
# "circle", "square", "ellipse", "number", "shade","color", "pie"
corrplot(correl, method = c("shade"))

# 상관계수 + 컬러
corrplot.mixed(correl)

cor.test(edu$satisfaction55, edu$class96_3)
##
## Pearson's product-moment correlation
##
## data: edu$satisfaction55 and edu$class96_3
## t = 7.8883, df = 310, p-value = 5.293e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3119928 0.4973270
## sample estimates:
## cor
## 0.408867
데이터 형태 변환 및 분석
# factor the dv(종속변수 범주형으로 변환)
edu$satisfaction55 <- factor(edu$satisfaction55, level=c(1:5), label=c("sobad","bad","normal","good","sogood"))
# restructure(데이터 형태 변환)
longdata <- mlogit.data(edu, choice = "satisfaction55", shape="wide")
# run 분석(설명변수 14개 선택)
model <- mlogit(satisfaction55 ~ 1 | facilities80_2+facilities81_3+facilities82_4+
pride85_1+pride86_2+schoollife87_1+schoollife88_2+schoollife89_3+
class94_1+class95_2+class96_3+class97_4+class98_5+class99_6,
data=longdata, reflevel="bad") #reflevel 는 base 설정하기 위한 것
summary(model)
##
## Call:
## mlogit(formula = satisfaction55 ~ 1 | facilities80_2 + facilities81_3 +
## facilities82_4 + pride85_1 + pride86_2 + schoollife87_1 +
## schoollife88_2 + schoollife89_3 + class94_1 + class95_2 +
## class96_3 + class97_4 + class98_5 + class99_6, data = longdata,
## reflevel = "bad", method = "nr", print.level = 0)
##
## Frequencies of alternatives:
## bad good normal sobad sogood
## 0.0352564 0.5608974 0.2243590 0.0032051 0.1762821
##
## nr method
## 9 iterations, 0h:0m:0s
## g'(-H)^-1g = 2.67E-08
## gradient close to zero
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## good:(intercept) -7.8699e+00 2.7281e+00 -2.8847 0.003917 **
## normal:(intercept) -2.4103e+00 2.6440e+00 -0.9116 0.361977
## sobad:(intercept) -5.3357e-01 8.0147e+00 -0.0666 0.946920
## sogood:(intercept) -1.6591e+01 3.1105e+00 -5.3340 9.609e-08 ***
## good:facilities80_2 3.0305e-01 6.8533e-01 0.4422 0.658344
## normal:facilities80_2 -1.4745e-01 6.9335e-01 -0.2127 0.831585
## sobad:facilities80_2 2.9592e-01 2.1783e+00 0.1359 0.891940
## sogood:facilities80_2 2.3565e-01 7.1467e-01 0.3297 0.741600
## good:facilities81_3 -1.1685e+00 7.0196e-01 -1.6646 0.095983 .
## normal:facilities81_3 -1.2980e+00 7.1710e-01 -1.8101 0.070275 .
## sobad:facilities81_3 -1.8614e+00 2.3063e+00 -0.8071 0.419594
## sogood:facilities81_3 -1.0584e+00 7.4185e-01 -1.4267 0.153654
## good:facilities82_4 1.2637e+00 6.1596e-01 2.0516 0.040204 *
## normal:facilities82_4 1.5309e+00 6.3070e-01 2.4274 0.015209 *
## sobad:facilities82_4 9.4720e-01 1.9749e+00 0.4796 0.631494
## sogood:facilities82_4 1.0745e+00 6.4398e-01 1.6686 0.095203 .
## good:pride85_1 3.3963e-02 9.5730e-01 0.0355 0.971699
## normal:pride85_1 2.7938e-01 9.7360e-01 0.2870 0.774146
## sobad:pride85_1 6.4888e-01 3.2331e+00 0.2007 0.840933
## sogood:pride85_1 4.6274e-03 9.8265e-01 0.0047 0.996243
## good:pride86_2 1.9475e+00 8.7676e-01 2.2213 0.026333 *
## normal:pride86_2 1.2242e+00 8.9091e-01 1.3741 0.169398
## sobad:pride86_2 5.5270e-01 2.9112e+00 0.1899 0.849423
## sogood:pride86_2 2.5139e+00 9.2104e-01 2.7294 0.006345 **
## good:schoollife87_1 -4.2881e-01 7.8327e-01 -0.5475 0.584060
## normal:schoollife87_1 -2.9809e-01 8.0548e-01 -0.3701 0.711326
## sobad:schoollife87_1 -1.2389e-02 2.4764e+00 -0.0050 0.996008
## sogood:schoollife87_1 -4.1482e-01 8.2230e-01 -0.5045 0.613937
## good:schoollife88_2 -2.3139e-01 7.9946e-01 -0.2894 0.772250
## normal:schoollife88_2 -5.5247e-01 8.1345e-01 -0.6792 0.497028
## sobad:schoollife88_2 -4.5565e-01 2.6493e+00 -0.1720 0.863447
## sogood:schoollife88_2 1.9863e-02 8.5194e-01 0.0233 0.981399
## good:schoollife89_3 -1.1345e+00 7.3996e-01 -1.5333 0.125214
## normal:schoollife89_3 -1.0162e+00 7.6255e-01 -1.3326 0.182658
## sobad:schoollife89_3 1.6310e-01 2.6384e+00 0.0618 0.950706
## sogood:schoollife89_3 -9.2099e-01 7.7462e-01 -1.1890 0.234455
## good:class94_1 -8.5247e-01 1.0845e+00 -0.7860 0.431852
## normal:class94_1 -1.6647e+00 1.1024e+00 -1.5101 0.131011
## sobad:class94_1 -9.2264e-01 3.4115e+00 -0.2705 0.786814
## sogood:class94_1 -4.9406e-01 1.1616e+00 -0.4253 0.670604
## good:class95_2 1.5336e-01 9.5857e-01 0.1600 0.872888
## normal:class95_2 2.8062e-04 9.7726e-01 0.0003 0.999771
## sobad:class95_2 -2.9243e-01 2.9761e+00 -0.0983 0.921726
## sogood:class95_2 2.6085e-01 1.0316e+00 0.2529 0.800382
## good:class96_3 2.4058e+00 1.0123e+00 2.3764 0.017482 *
## normal:class96_3 2.7799e+00 1.0295e+00 2.7003 0.006928 **
## sobad:class96_3 1.0064e+00 3.1052e+00 0.3241 0.745862
## sogood:class96_3 2.6673e+00 1.0777e+00 2.4750 0.013323 *
## good:class97_4 -2.2359e-01 8.5974e-01 -0.2601 0.794808
## normal:class97_4 4.5506e-01 8.8948e-01 0.5116 0.608928
## sobad:class97_4 1.1019e-01 2.8478e+00 0.0387 0.969136
## sogood:class97_4 -4.1411e-02 9.3180e-01 -0.0444 0.964552
## good:class98_5 1.4481e-01 1.0519e+00 0.1377 0.890502
## normal:class98_5 -6.4164e-01 1.0771e+00 -0.5957 0.551359
## sobad:class98_5 -3.6633e-01 3.7289e+00 -0.0982 0.921741
## sogood:class98_5 -2.3716e-01 1.0913e+00 -0.2173 0.827962
## good:class99_6 9.9312e-01 1.0650e+00 0.9325 0.351079
## normal:class99_6 7.2476e-01 1.0870e+00 0.6667 0.504945
## sobad:class99_6 -2.3388e-01 3.5453e+00 -0.0660 0.947402
## sogood:class99_6 1.4908e+00 1.1160e+00 1.3358 0.181603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -272.94
## McFadden R^2: 0.20611
## Likelihood ratio test : chisq = 141.73 (p.value = 2.2467e-09)
분석 결과
# look at correct classification
correct <- model$probabilities
binarycorrect <- colnames(correct)[apply(correct,1,which.max)]
table(binarycorrect)
## binarycorrect
## bad good normal sogood
## 3 246 32 31
table(edu$satisfaction55, binarycorrect)
## binarycorrect
## bad good normal sogood
## sobad 0 1 0 0
## bad 2 7 2 0
## normal 1 52 17 0
## good 0 159 10 6
## sogood 0 27 3 25
# percent (정분류율)
(2+17+159+25)/nrow(edu)*100
## [1] 65.0641
시각화
# ggplot
cleanup <- theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.line.x = element_line(color = "black"),
axis.line.y = element_line(color = "black"),
text = element_text(size=20),
legend.key = element_blank())
correct <- as.data.frame(correct)
# correct$good, correct$bad 등 범주 교체 가능
# bad 중심으로 분포 확인
hist <- ggplot(edu, aes(correct$bad, color=satisfaction55, fill=satisfaction55))
hist + cleanup +
geom_dotplot(binwidth=.01, position="jitter") +
coord_cartesian(xlim=c(0,1)) +
xlab("Likelihood of bad") +
ylab("Frequency") +
scale_color_manual(values=c("Blue","Green","Red","yellow", "black"),
labels=c("sobad","bad", "normal","good","sogood"),
name="Diagnosis") +
scale_fill_manual(values=c("Blue","Green","Red","yellow", "black"),
labels=c("sobad","bad", "normal","good","sogood"),
name="Diagnosis")
