edu<-read.csv('edu.csv')
nrow(edu) #rawdata 행의 개수는 6908개 이다.
[1] 6908
edu$math<-as.character(edu$math)
edu$math<-as.numeric(edu$math)
edu$language<-as.character(edu$language)
edu$language<-as.numeric(edu$language)
edu$foreign<-as.character(edu$foreign)
edu$foreign<-as.numeric(edu$foreign) #언어,수학,외국어 점수가 facotr로 되어있어 수치형으로 전환한다.
edu.rm<-na.omit(edu) #결측치를 모두 제거한다.
edu.sub<-subset(edu.rm,income>1&educost>1,select=-c(parreply,stureply)) #소득,교육비에서 무응답과 기재오류는 음수로 기재되어 있으므로 제거한다.
nrow(edu.sub) #결측치 제외하고 3063명의 데이터가 존재한다.
[1] 3063
hist(edu.sub$income,200,xlim=c(0,3000),probability = T,main='월평균 소득 분포의 히스토그램',xlab='월소득(만원)',col='darkgoldenrod1')
lines(density(edu.sub$income),col='navy',lwd=2)
legend(1500,0.002,'평균 406만원',bty='n')
legend(1500,0.0017,'중앙값 350만원',bty='n')
hist(edu.sub$educost,150,xlim=c(0,800),probability = T,main='월평균 교육비 지출 분포의 히스토그램',xlab='월 교육비 (만원)',col='olivedrab1')
lines(density(edu.sub$educost),col='tomato',lwd=2)
legend(450,0.01, '평균 58만원',bty='n')
legend(450,0.0085,'중앙값 50만원',bty='n')
#소득을 25%,50%,75% 기준으로 나누고 1,2,3,4 로 coding한다.
q<-quantile(edu.sub$income,c(0,.25,.50,.75,1))
edu.sub$incomegr<-cut(edu.sub$income,q)
edu.sub$incomegr<-as.numeric(edu.sub$incomegr)
#언어영역 표준점수를 등급으로 나눈다.
edu.sub$langgr<-cut(edu.sub$language,c(150,129,123,116,108,97,86,72,60,1),right = FALSE,levels<-c(9:1),labels = paste('Gr',9:1))
#수학 가형, 나형 각각의 등급을 구하고, 데이터를 다시 합친다.
mathA<-subset(edu.sub,mathtype=='a') #수학 가형 응시자만 선택한다
mathB<-subset(edu.sub,mathtype=='b') #수학 나형 응시지만 선택한다
mathA$mathgr<-cut(mathA$math,c(150,132,124,116,107,96,83,72,65,1),right = FALSE,levels<-c(9:1),labels = paste('Gr',9:1))
mathB$mathgr<-cut(mathB$math,c(150,139,129,117,103,89,83,79,75,1),right = FALSE,levels<-c(9:1),labels = paste('Gr',9:1))
edu.sub<-rbind(mathA,mathB)
#외국어 영역 표준점수를 등급으로 나눈다
edu.sub$foregr<-cut(edu.sub$foreign,c(150,132,125,117,107,95,82,72,66,1),right = FALSE,levels<-c(9:1),labels = paste('Gr',9:1))
edu.cut<-edu.sub[,-c(5:7)]
edu.re <- subset(edu.cut, select=c(1,2,3,4,6,7,5,8,9))
colnames(edu.re)<-c('ID','성별','수입','교육비','소득 등급','언어 등급','수리 선택','수리 등급','외국어 등급')
library(googleVis)
cuttable <- gvisTable(edu.re, options = list(width = 600, height = 300, page = "enable"))
op<-options(gvis.plot.tage=NULL)
cuttable
edu.cut$lie<-ifelse(edu.cut$educost>edu.cut$income,1,0) #ifelse함수를 통해 소득보다 교육비 지출이 큰 학생을 1로 표시한다.
edu.truth<-subset(edu.cut, lie==0)
library(reshape2)
langdftruth<-dcast(incomegr~langgr,data=edu.truth)[c(1:4),]
colnames(langdftruth)<-c('incomegr',paste0('Gr',9:1))
langdftruth
incomegr Gr9 Gr8 Gr7 Gr6 Gr5 Gr4 Gr3 Gr2 Gr1
1 1 53 62 144 151 201 153 100 57 26
2 2 13 46 81 128 181 128 94 53 24
3 3 15 30 87 140 224 164 125 75 49
4 4 5 14 43 48 95 90 72 32 42
library(VGAM)
model1<-vglm(cbind(Gr1,Gr2,Gr3,Gr4,Gr5,Gr6,Gr7,Gr8,Gr9)~incomegr,family = cumulative(parallel = TRUE),data=langdftruth)
summary(model1)
Call:
vglm(formula = cbind(Gr1, Gr2, Gr3, Gr4, Gr5, Gr6, Gr7, Gr8,
Gr9) ~ incomegr, family = cumulative(parallel = TRUE), data = langdftruth)
Pearson residuals:
logit(P[Y<=1]) logit(P[Y<=2]) logit(P[Y<=3]) logit(P[Y<=4])
1 -1.06387 0.75583 0.6470 1.59238
2 -1.41970 0.06556 0.2636 -0.25094
3 -0.02181 0.17216 -0.3873 -1.30520
4 2.49904 -1.14302 -0.5563 0.02872
logit(P[Y<=5]) logit(P[Y<=6]) logit(P[Y<=7]) logit(P[Y<=8])
1 0.15487 -1.286 -0.5865 -2.7262
2 -0.07493 1.258 -0.8288 2.1137
3 -0.18296 1.210 1.3323 0.8767
4 0.22029 -1.353 0.5019 0.9393
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept):1 -3.691252 0.114496 -32.239 <2e-16 ***
(Intercept):2 -2.675798 0.093339 -28.667 <2e-16 ***
(Intercept):3 -1.772170 0.084134 -21.064 <2e-16 ***
(Intercept):4 -0.956249 0.079575 -12.017 <2e-16 ***
(Intercept):5 0.003624 0.077604 0.047 0.963
(Intercept):6 0.808492 0.080084 10.096 <2e-16 ***
(Intercept):7 1.867721 0.093188 20.043 <2e-16 ***
(Intercept):8 2.944691 0.126516 23.275 <2e-16 ***
incomegr 0.276772 0.030508 9.072 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Number of linear predictors: 8
Dispersion Parameter for cumulative family: 1
Residual deviance: 39.861 on 23 degrees of freedom
Log-likelihood: -110.8192 on 23 degrees of freedom
Number of iterations: 4
Exponentiated coefficients:
incomegr
1.318865
model1.1<-vglm(cbind(Gr1,Gr2,Gr3,Gr4,Gr5,Gr6,Gr7,Gr8,Gr9)~incomegr, family = cumulative,data=langdftruth)
pchisq(2*(logLik(model1.1)-logLik(model1)),df=length(coef(model1.1))-length(coef(model1)),lower.tail = FALSE)
[1] 0.002081759
alt text
library(nnet)
model2<-multinom(langgr~incomegr,data=edu.truth)
# weights: 27 (16 variable)
initial value 6690.548838
iter 10 value 6175.031827
iter 20 value 6118.234568
final value 6118.187637
converged
summary(model2)
Call:
multinom(formula = langgr ~ incomegr, data = edu.truth)
Coefficients:
(Intercept) incomegr
Gr 8 -0.04705314 0.3392090
Gr 7 0.58548755 0.4456064
Gr 6 0.65775094 0.5403501
Gr 5 0.81335068 0.6520786
Gr 4 0.43105284 0.7001631
Gr 3 -0.04486371 0.7679590
Gr 2 -0.51593722 0.7190208
Gr 1 -1.84652899 1.0689233
Std. Errors:
(Intercept) incomegr
Gr 8 0.2943594 0.1476915
Gr 7 0.2621875 0.1337102
Gr 6 0.2566944 0.1310782
Gr 5 0.2504067 0.1285139
Gr 4 0.2563584 0.1300916
Gr 3 0.2660455 0.1325308
Gr 2 0.2874136 0.1392745
Gr 1 0.3381613 0.1490537
Residual Deviance: 12236.38
AIC: 12268.38
subj<-c('lang','lang','matha','matha','mathb','mathb','fore','fore')
hr<-c(1.92,3.14,2.07,9,1.59,2.41,1.82,3.15)
testtable<-data.frame(subj,hr)
testtable$subj<-factor(testtable$subj,levels=c('fore', 'mathb','matha','lang'), labels=c('외국어','수학 나','수학 가','언어'))
pl<-ggplot(testtable, aes(x=hr, y=subj))+geom_line(size=1, aes(colour=subj))+geom_point()+ggtitle("과목별 1등급에 속할 오즈 비")+ theme(plot.title=element_text(face="bold", size=20, vjust=0, color="darkblue"))
pl