Choose variables

讀取變項,整理檔案

# Load the raw CSV export (the path is specific to the author's machine).
ML_raw <- read.table("D:/104/ML_R/WOW_data.csv",
                     header = TRUE, sep = ",")
# Keep the six analysis variables by name instead of by position, so a change
# in the CSV column order cannot silently select the wrong columns.
ML <- ML_raw[c("senson", "school_3", "X", "reading_1", "gender", "reading_3")]
# attach() is intentionally not used: every later step refers to the data
# frame explicitly, and an attached copy would go stale as soon as columns of
# ML are modified.
head(ML)
##   senson school_3  X reading_1 gender reading_3
## 1      1      206  3       472      0       492
## 2      1      101 15       478      0       494
## 3      1      103 16       500      0       523
## 4      1      104 30       431      1       476
## 5      1      107 64       500      0       515
## 6      1      101 66       429      0       450

以學校為目標計算平均數,並將資料以學校併入

# Per-school means of both reading scores and of the gender code
# (the mean of the gender code is stored under the name man_ratio).
ML_M <- aggregate(cbind(reading_1, reading_3, gender) ~ school_3,
                  data = ML, FUN = mean)
colnames(ML_M) <- c("school_3", "reading_1_mean", "reading_3_mean", "man_ratio")
# Per-school standard deviations of the two reading scores.
ML_sd <- aggregate(cbind(reading_1, reading_3) ~ school_3,
                   data = ML, FUN = sd)
colnames(ML_sd)[c(2, 3)] <- c("reading_1_SD", "reading_3_SD")
head(ML_M)
##   school_3 reading_1_mean reading_3_mean man_ratio
## 1      101       474.4444       498.6667 0.4444444
## 2      102       482.3333       505.6667 0.6666667
## 3      103       464.2500       487.7500 0.5000000
## 4      104       469.1250       488.7500 0.6250000
## 5      105       469.0000       492.5000 0.4166667
## 6      106       482.0000       498.8571 0.4285714

Loading packages

載入套件畫圖

library(ggplot2)
library(lattice)
library(plotly)
library(plyr)

將每位學生重新排序,看一下資料

# Stack the two student-level reading scores into long form: stack() returns
# one row per score in `values`, labelled by its source column in `ind`.
ML_r <- stack(ML[c("reading_1", "reading_3")])
head(ML_r)
##   values       ind
## 1    472 reading_1
## 2    478 reading_1
## 3    500 reading_1
## 4    431 reading_1
## 5    500 reading_1
## 6    429 reading_1

將各校平均分數重新排列

# Same long-form reshaping for the school means: `values` holds the mean,
# `ind` names the column it came from.
ML_mm <- stack(ML_M[c("reading_1_mean", "reading_3_mean")])
head(ML_mm)
##     values            ind
## 1 474.4444 reading_1_mean
## 2 482.3333 reading_1_mean
## 3 464.2500 reading_1_mean
## 4 469.1250 reading_1_mean
## 5 469.0000 reading_1_mean
## 6 482.0000 reading_1_mean

Plot

直方圖(所有學生)

# Side-by-side (dodged) histograms of all student scores, one fill colour per
# score variable.
p1 <- ggplot(data = ML_r, mapping = aes(x = values, fill = ind)) +
  geom_histogram(binwidth = 0.5, position = "dodge") +
  labs(x = "raw_score", y = "count")

# Render the static plot as an interactive plotly widget.
ggplotly(p1)

機率分佈(所有學生)

# Overlaid, semi-transparent density curves of all student scores.
p2 <- ggplot(data = ML_r, mapping = aes(x = values, fill = ind)) +
  geom_density(alpha = 0.3) +
  labs(x = "raw_score")  # y axis is the estimated density (PDF)
ggplotly(p2)

直方圖(所有學校)

# Dodged histograms of the school mean scores, one fill colour per variable.
p3 <- ggplot(data = ML_mm, mapping = aes(x = values, fill = ind)) +
  geom_histogram(binwidth = 0.5, position = "dodge") +
  labs(x = "average score of school", y = "count")
ggplotly(p3)

機率分佈(所有學校)

# Overlaid, semi-transparent density curves of the school mean scores.
p4 <- ggplot(data = ML_mm, mapping = aes(x = values, fill = ind)) +
  geom_density(alpha = 0.3) +
  labs(x = "average score of school")
ggplotly(p4)

Regression

學生層次相關係數

# Student-level correlation between the two reading scores.
cor(ML[c("reading_1", "reading_3")])
##           reading_1 reading_3
## reading_1 1.0000000 0.8709099
## reading_3 0.8709099 1.0000000

學校層次相關係數(生態相關)

# School-level correlation between the two mean reading scores
# (the school id and man_ratio columns are left out).
cor(ML_M[c("reading_1_mean", "reading_3_mean")])
##                reading_1_mean reading_3_mean
## reading_1_mean      1.0000000      0.8838805
## reading_3_mean      0.8838805      1.0000000

整理排序(把學生跟學校都放進來,另外加一個type)

# Treat the senson code as categorical for the grouped plots further down.
ML$senson <- as.factor(ML$senson)
# School means, renamed so they can be row-bound with the student scores.
ML_m1 <- ML_M[c("reading_1_mean", "reading_3_mean")]
names(ML_m1) <- c("reading_1", "reading_3")
ML_m2 <- rbind(ML_m1, ML[c("reading_1", "reading_3")])
# Label each row by where it came from. The counts are taken from the data
# rather than hard-coded (33 schools / 193 students), so a different input
# file can neither raise a length-mismatch error nor mislabel rows.
ML_m2$type <- rep(c("school", "student"),
                  times = c(nrow(ML_m1), nrow(ML)))
head(ML_m2)
##   reading_1 reading_3   type
## 1  474.4444  498.6667 school
## 2  482.3333  505.6667 school
## 3  464.2500  487.7500 school
## 4  469.1250  488.7500 school
## 5  469.0000  492.5000 school
## 6  482.0000  498.8571 school

繪製全部學生及學校的散佈圖+迴歸線

# Scatter of reading_3 against reading_1 for schools and students together,
# with a separate linear fit (and confidence band) for each type.
p5 <- ggplot(data = ML_m2,
             mapping = aes(x = reading_1, y = reading_3,
                           colour = type, shape = type)) +
  geom_point(size = 2) +
  geom_smooth(aes(fill = type), method = lm, size = 2)
ggplotly(p5)

三年級閱讀成績預測五年級閱讀成績(學校)

# One grey regression line of reading_3 on reading_1 per school, the student
# points, and a single black regression line fitted to all students pooled.
# se = FALSE is spelled out because the shorthand F is an ordinary variable
# that can be reassigned.
p6 <- ggplot(data = ML, aes(x = reading_1, y = reading_3, group = school_3)) +
  stat_smooth(method = "lm", se = FALSE, color = "#999999") +
  geom_point(size = 2.5, color = "#56B4E9") +
  # group = 1 overrides the per-school grouping for the pooled fit
  stat_smooth(aes(group = 1), method = "lm", se = FALSE, color = "black") +
  labs(x = "reading_1", y = "reading_3", title = "school")
ggplotly(p6)

三年級閱讀成績預測五年級閱讀成績(senson)

# One coloured regression line of reading_3 on reading_1 per senson level,
# the student points in matching colours and shapes, and a black regression
# line fitted to all students pooled.
# se = FALSE is spelled out because the shorthand F is an ordinary variable
# that can be reassigned.
p7 <- ggplot(data = ML,
             aes(x = reading_1, y = reading_3, group = senson, shape = senson)) +
  geom_smooth(aes(color = senson), method = "lm", se = FALSE) +
  geom_point(aes(color = senson), size = 2.5) +
  # group = 1 overrides the per-senson grouping for the pooled fit
  stat_smooth(aes(group = 1), method = "lm", se = FALSE, color = "black") +
  labs(x = "reading_1", y = "reading_3", title = "senson") +
  # the manual palette supplies exactly two colours
  scale_colour_manual(values = c("#56B4E9", "#009E73"))
ggplotly(p7)

Mean Center

將自變項以總分置中

# Grand-mean centre the predictor. Plain subtraction stores an ordinary
# numeric vector, whereas scale() would put a one-column matrix carrying a
# "scaled:center" attribute into the data frame. na.rm = TRUE mirrors how
# scale() computes the centre.
ML$reading_1_center <- ML$reading_1 - mean(ML$reading_1, na.rm = TRUE)

把senson和學校轉成factor

將性別轉換成文字(0=female,1=male)

# School id becomes a categorical variable.
ML$school_3 <- as.factor(ML$school_3)
# Recode the gender values 0 and 1 to the labels "female" and "male".
ML$gender <- plyr::mapvalues(ML$gender, from = c(0, 1), to = c("female", "male"))
head(ML)
##   senson school_3  X reading_1 gender reading_3 reading_1_center
## 1      1      206  3       472 female       492       -5.2642487
## 2      1      101 15       478 female       494        0.7357513
## 3      1      103 16       500 female       523       22.7357513
## 4      1      104 30       431   male       476      -46.2642487
## 5      1      107 64       500 female       515       22.7357513
## 6      1      101 66       429 female       450      -48.2642487

隨機選取學校及區域

# Randomly draw 9 school levels and 2 senson levels for the facet plots.
# No seed is set here, so the selection differs from run to run.
ns <- sample(x = levels(ML$school_3), size = 9)
nc <- sample(x = levels(ML$senson), size = 2)

Mean Center plot

學校

# For the randomly chosen schools: reading_3 against centred reading_1, with
# one linear fit per gender, in one panel per school.
# se = FALSE is spelled out because the shorthand F is an ordinary variable
# that can be reassigned.
p8 <- ggplot(data = ML[ML$school_3 %in% ns, ],
             aes(x = reading_1_center, y = reading_3, color = gender)) +
  geom_point(size = 1) +
  stat_smooth(method = "lm", se = FALSE) +
  facet_wrap(~school_3)
ggplotly(p8)

senson

# For the randomly chosen senson levels: reading_3 against centred reading_1,
# with one linear fit per gender, in one panel per senson level.
# se = FALSE is spelled out because the shorthand F is an ordinary variable
# that can be reassigned.
p9 <- ggplot(data = ML[ML$senson %in% nc, ],
             aes(x = reading_1_center, y = reading_3, color = gender)) +
  geom_point(size = 1) +
  stat_smooth(method = "lm", se = FALSE) +
  facet_wrap(~senson)
ggplotly(p9)