edu<-read.csv('edu.csv')
nrow(edu) #rawdata 행의 개수는 6908개 이다.
[1] 6908
edu$math<-as.character(edu$math)
edu$math<-as.numeric(edu$math)
edu$language<-as.character(edu$language)
edu$language<-as.numeric(edu$language)
edu$foreign<-as.character(edu$foreign)
edu$foreign<-as.numeric(edu$foreign) #언어,수학,외국어 점수가 facotr로 되어있어 수치형으로 전환한다.
edu.rm<-na.omit(edu) #결측치를 모두 제거한다.
edu.sub<-subset(edu.rm,income>=0&educost>=0,select=-c(parreply,stureply)) #소득,교육비에서 무응답과 기재오류는 음수로 기재되어 있으므로 제거한다.
nrow(edu.sub) #결측치 제외하고 3612명의 데이터가 존재한다.
[1] 3612
hist(edu.sub$income,200,xlim=c(0,3000),probability = T,main='월평균 소득 분포의 히스토그램',xlab='월소득(만원)',col='darkgoldenrod1')
lines(density(edu.sub$income),col='navy',lwd=2)
legend(1500,0.002,'평균 406만원',bty='n')
legend(1500,0.0017,'중앙값 350만원',bty='n')
hist(edu.sub$educost,150,xlim=c(0,800),probability = T,main='월평균 교육비 지출 분포의 히스토그램',xlab='월 교육비 (만원)',col='olivedrab1')
lines(density(edu.sub$educost),col='tomato',lwd=2)
legend(450,0.01, '평균 58만원',bty='n')
legend(450,0.0085,'중앙값 50만원',bty='n')
library(reshape2)
langdf<-dcast(incomegr~langgr,data=edu.sub)[c(1:4),]
colnames(langdf)<-c('incomegr',paste0('Gr',9:1))
langdf
incomegr Gr9 Gr8 Gr7 Gr6 Gr5 Gr4 Gr3 Gr2 Gr1
1 1 57 72 122 127 180 132 83 41 25
2 2 21 58 115 131 181 131 90 56 26
3 3 15 25 64 129 186 135 97 59 30
4 4 13 28 94 111 197 162 134 69 66
library(VGAM)
model1<-vglm(cbind(Gr1,Gr2,Gr3,Gr4,Gr5,Gr6,Gr7,Gr8,Gr9)~incomegr,family = cumulative(parallel = TRUE),data=langdf)
summary(model1)
Call:
vglm(formula = cbind(Gr1, Gr2, Gr3, Gr4, Gr5, Gr6, Gr7, Gr8,
Gr9) ~ incomegr, family = cumulative(parallel = TRUE), data = langdf)
Pearson residuals:
logit(P[Y<=1]) logit(P[Y<=2]) logit(P[Y<=3]) logit(P[Y<=4])
1 0.001378 0.02663 0.4444 1.6254
2 -1.124973 0.81359 0.2252 0.1523
3 -1.180680 0.10719 -0.4300 -0.8246
4 1.784312 -0.77569 -0.1870 -0.8501
logit(P[Y<=5]) logit(P[Y<=6]) logit(P[Y<=7]) logit(P[Y<=8])
1 1.0148 -0.7160 -1.9547 -2.693
2 -0.4650 -0.8599 -0.4773 1.737
3 -0.2095 3.2028 1.5951 0.639
4 -0.3339 -1.2536 1.6747 1.063
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 -3.76930 0.11241 -33.531 <2e-16 ***
(Intercept):2 -2.75982 0.09180 -30.062 <2e-16 ***
(Intercept):3 -1.86452 0.08283 -22.510 <2e-16 ***
(Intercept):4 -1.05349 0.07828 -13.458 <2e-16 ***
(Intercept):5 -0.10567 0.07600 -1.390 0.164
(Intercept):6 0.66926 0.07769 8.615 <2e-16 ***
(Intercept):7 1.68747 0.08817 19.140 <2e-16 ***
(Intercept):8 2.75936 0.11676 23.632 <2e-16 ***
incomegr 0.26936 0.02728 9.873 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Number of linear predictors: 8
Dispersion Parameter for cumulative family: 1
Residual deviance: 47.1536 on 23 degrees of freedom
Log-likelihood: -116.6102 on 23 degrees of freedom
Number of iterations: 4
Exponentiated coefficients:
incomegr
1.309121
model1.1<-vglm(cbind(Gr1,Gr2,Gr3,Gr4,Gr5,Gr6,Gr7,Gr8,Gr9)~incomegr, family = cumulative,data=langdfrm)
pchisq(2*(logLik(model1.1)-logLik(model1)),df=length(coef(model1.1))-length(coef(model1)),lower.tail = FALSE)
[1] 0.001392138
library(nnet)
model2<-multinom(langgr~incomegr,data=edu.sub)
# weights: 27 (16 variable)
initial value 7167.346571
iter 10 value 6636.127269
iter 20 value 6588.249841
final value 6588.029446
converged
summary(model2)
Call:
multinom(formula = langgr ~ incomegr, data = edu.sub)
Coefficients:
(Intercept) incomegr
Gr 8 0.1798572 0.1880819
Gr 7 0.4424347 0.4198536
Gr 6 0.4478921 0.5145472
Gr 5 0.6789001 0.5828952
Gr 4 0.3024922 0.6189410
Gr 3 -0.2606483 0.7085328
Gr 2 -0.8360391 0.7048447
Gr 1 -1.8125864 0.9005770
Std. Errors:
(Intercept) incomegr
Gr 8 0.2604003 0.1199112
Gr 7 0.2360416 0.1077512
Gr 6 0.2319340 0.1057704
Gr 5 0.2248306 0.1033565
Gr 4 0.2315193 0.1050610
Gr 3 0.2437305 0.1078395
Gr 2 0.2694719 0.1148285
Gr 1 0.3163647 0.1247855
Residual Deviance: 13176.06
AIC: 13208.06
| Coef | LowCI | HighCI | |
|---|---|---|---|
| 언어영역 | 0.90 | 1.93 | 3.14 |
library(plotly)
subj<-c('lang','lang','matha','matha','mathb','mathb','fore','fore')
hr<-c(1.92,3.14,2.07,9,1.59,2.41,1.82,3.15)
testtable<-data.frame(subj,hr)
testtable$subj<-factor(testtable$subj,levels=c('fore', 'mathb','matha','lang'), labels=c('외국어','수학 나','수학 가','언어'))
pl<-ggplot(testtable, aes(x=hr, y=subj))+geom_line(size=1, aes(colour=subj))+geom_point()+ggtitle("과목별 1등급에 속할 오즈 비")+ theme(plot.title=element_text(face="bold", size=20, vjust=0, color="darkblue"))
py <- plotly("RgraphingAPI", "ektgzomjbx")
ggplotly(pl, session='knitr')