pacman::p_load(WWGbook)
data(classroom, package="WWGbook")?classroom #help classroomstr(classroom) #data structure of classroom'data.frame': 1190 obs. of 12 variables:
$ sex : int 1 0 1 0 0 1 0 0 1 0 ...
$ minority: int 1 1 1 1 1 1 1 1 1 1 ...
$ mathkind: int 448 460 511 449 425 450 452 443 422 480 ...
$ mathgain: int 32 109 56 83 53 65 51 66 88 -7 ...
$ ses : num 0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
$ yearstea: num 1 1 1 2 2 2 2 2 2 2 ...
$ mathknow: num NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
$ housepov: num 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
$ mathprep: num 2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
$ classid : int 160 160 160 217 217 217 217 217 217 217 ...
$ schoolid: int 1 1 1 1 1 1 1 1 1 1 ...
$ childid : int 1 2 3 4 5 6 7 8 9 10 ...
names(classroom) [1] "sex" "minority" "mathkind" "mathgain" "ses" "yearstea"
[7] "mathknow" "housepov" "mathprep" "classid" "schoolid" "childid"
dta_sch<-table(classroom$schoolid)
#classroom$schoolid的值與次數作成table,存成dta_sch
#dta_sch|>as.data.frame() 表格太長先省略
#以data.frame的樣貌呈現dta_sch表格,發現有107個數值
dta_schl <- classroom[duplicated(classroom$schoolid)==FALSE,
c("schoolid", "housepov")]
#schoolid除了第一次出現的數值外,刪除重複的值
#將檔案存成dta_schl,僅保留 c("schoolid", "housepov")
str(dta_schl)'data.frame': 107 obs. of 2 variables:
$ schoolid: int 1 2 3 4 5 6 7 8 9 10 ...
$ housepov: num 0.082 0.082 0.086 0.365 0.511 0.044 0.148 0.085 0.537 0.346 ...
#共有107 obs. of 2 variables,與dta_sch一致
head(dta_schl) schoolid housepov
1 1 0.082
12 2 0.082
22 3 0.086
36 4 0.365
42 5 0.511
48 6 0.044
dta_sch2 <- classroom[duplicated(classroom$schoolid)==TRUE,
c("schoolid", "housepov")]
#[duplicated(classroom$schoolid)==TRUE,刪除重複的第一筆資料,保留其他的資料。
str(dta_sch2)'data.frame': 1083 obs. of 2 variables:
$ schoolid: int 1 1 1 1 1 1 1 1 1 1 ...
$ housepov: num 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
#共有1083 obs. of 2 variables
#str(classroom)有1190 obs.- str(dta_schl)有 170 obs.=1083 obs.dta_cl<-table(classroom$classid)
#dta_cl|>as.data.frame() 表格太長先省略
#classid的值與分布次數統計,共有312個值(1~312)
dta_cls <- classroom[duplicated(classroom$classid)==FALSE,
c(11, 10, 6,7,9)]
#classid除了第一次出現的數值外,刪除重複的值
#將檔案存成dta_cls,僅保留c(11, 10, 6, 7, 9) c("schoolid", "classid", "yearstea", "mathknow", "mathprep")
knitr::include_graphics("names(classroom).png") #原來的12個variables
str(dta_cls)'data.frame': 312 obs. of 5 variables:
$ schoolid: int 1 1 2 2 2 3 3 3 3 4 ...
$ classid : int 160 217 197 211 307 11 137 145 228 48 ...
$ yearstea: num 1 2 1 2 12.5 ...
$ mathknow: num NA -0.11 -1.25 -0.72 NA 0.45 0.99 1.61 1.14 -1.05 ...
$ mathprep: num 2 3.25 2.5 2.33 2.3 3.83 2.25 3 2.17 2 ...
#共有312 obs. of 5 variables,與table(classroom$classid)結果一致
head(dta_cls) schoolid classid yearstea mathknow mathprep
1 1 160 1.00 NA 2.00
4 1 217 2.00 -0.11 3.25
12 2 197 1.00 -1.25 2.50
14 2 211 2.00 -0.72 2.33
18 2 307 12.54 NA 2.30
22 3 11 20.00 0.45 3.83
#列出前面六筆資料knitr::include_graphics("names(classroom).png")#原來的12個variables
dta_chld <- classroom[, c(12, 10, 11, 1:5)]
#保留classroom 第12、10、11、1~5共8個column,存成dta_chld
str(dta_chld)'data.frame': 1190 obs. of 8 variables:
$ childid : int 1 2 3 4 5 6 7 8 9 10 ...
$ classid : int 160 160 160 217 217 217 217 217 217 217 ...
$ schoolid: int 1 1 1 1 1 1 1 1 1 1 ...
$ sex : int 1 0 1 0 0 1 0 0 1 0 ...
$ minority: int 1 1 1 1 1 1 1 1 1 1 ...
$ mathkind: int 448 460 511 449 425 450 452 443 422 480 ...
$ mathgain: int 32 109 56 83 53 65 51 66 88 -7 ...
$ ses : num 0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
#data.frame': 1190 obs. of 8 variables
dta_chld|>as.data.frame()|>head(3) childid classid schoolid sex minority mathkind mathgain ses
1 1 160 1 1 1 448 32 0.46
2 2 160 1 0 1 460 109 -0.27
3 3 160 1 1 1 511 56 -0.03
#用|>as.data.frame檢查一下classroom[, c(12, 10, 11, 1:5)]原來資料的順序一致,列舉前3筆資料#Apply a Function over a List or Vector
dta_sap<-sapply(list(dta_schl, dta_cls, dta_chld), dim)
#運用sapply作成一個list
class(dta_sap)[1] "matrix" "array"
#是2x3 matrix也是array
str(dta_sap) int [1:2, 1:3] 107 2 312 5 1190 8
#資料皆為integer(整數)
dta_sap [,1] [,2] [,3]
[1,] 107 312 1190
[2,] 2 5 8
#把資料列出來
dta_sapp<-sapply(list(dta_schl$schoolid, dta_cls$schoolid, dta_chld$schoolid), mean)
#也可以用sapply列出資料的統計值,以mean為例
class(dta_sapp)[1] "numeric"
#numeric
str(dta_sapp) num [1:3] 54 52.6 52.9
#資料皆為numeric(數值,有小數)
dta_sapp[1] 54.00000 52.61859 52.94034
#把資料列出來,確實有小數dta_12 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
#將x=dta_chld(1190 obs. of 8 variables), y=dta_cls(312 obs. of 5 variables)兩個 data.frame 透過欄位by=c("classid", "schoolid")一樣的進行合併
#dta_cls有【"classid"+"schoolid"】2個以外,還有另外6個【"childid"+"sex"+"minority"+"mathkind"+"mathgain"+"ses"】
#dta_cls有【"classid"+"schoolid"】2個以外,還有另外3個【"yearstea"+"mathknow"+"mathprep"】
names(dta_12) [1] "classid" "schoolid" "childid" "sex" "minority" "mathkind"
[7] "mathgain" "ses" "yearstea" "mathknow" "mathprep"
#合併後資料共有11個column
#合併後column names按照合併順序+"x=dta_chld"+"y=dta_cls"
str(dta_12)'data.frame': 1190 obs. of 11 variables:
$ classid : int 1 1 1 1 1 10 10 100 101 101 ...
$ schoolid: int 61 61 61 61 61 17 17 33 20 20 ...
$ childid : int 653 654 655 656 657 216 217 396 249 250 ...
$ sex : int 0 0 1 1 0 1 0 1 0 1 ...
$ minority: int 1 1 1 1 1 1 1 1 1 1 ...
$ mathkind: int 442 422 482 489 460 381 419 443 484 417 ...
$ mathgain: int 35 109 66 19 10 143 69 39 68 79 ...
$ ses : num -0.09 -1.32 0.43 -0.06 -1.28 -0.49 -1.35 -1.32 -0.98 -1.16 ...
$ yearstea: num 2 2 2 2 2 5 5 19 6 6 ...
$ mathknow: num -0.72 -0.72 -0.72 -0.72 -0.72 -1.26 -1.26 0.12 -0.84 -0.84 ...
$ mathprep: num 2.5 2.5 2.5 2.5 2.5 3.25 3.25 4 2.25 2.25 ...
#兩個data.frame合併by=c("classid", "schoolid"),有就保留
#合併後共有1190 obs. of 11 variablesdta_12c <- merge(x=dta_chld, y=dta_cls, by="classid")
#將x=dta_chld, y=dta_cls兩個 data.frame 透過欄位by="classid"進行合併
names(dta_12c) [1] "classid" "childid" "schoolid.x" "sex" "minority"
[6] "mathkind" "mathgain" "ses" "schoolid.y" "yearstea"
[11] "mathknow" "mathprep"
#column變成12個,因為保留了X=dta_chld的"schoolid.x"和y=dta_cls的"schoolid.y"
str(dta_12c)'data.frame': 1190 obs. of 12 variables:
$ classid : int 1 1 1 1 1 2 2 2 3 3 ...
$ childid : int 653 654 655 656 657 402 403 404 613 614 ...
$ schoolid.x: int 61 61 61 61 61 34 34 34 56 56 ...
$ sex : int 0 0 1 1 0 0 0 1 1 1 ...
$ minority : int 1 1 1 1 1 1 0 1 0 0 ...
$ mathkind : int 442 422 482 489 460 453 472 437 506 516 ...
$ mathgain : int 35 109 66 19 10 60 52 85 -4 18 ...
$ ses : num -0.09 -1.32 0.43 -0.06 -1.28 -0.41 -1.61 0.55 -0.43 -0.26 ...
$ schoolid.y: int 61 61 61 61 61 34 34 34 56 56 ...
$ yearstea : num 2 2 2 2 2 2 2 2 13 13 ...
$ mathknow : num -0.72 -0.72 -0.72 -0.72 -0.72 0.58 0.58 0.58 0.85 0.85 ...
$ mathprep : num 2.5 2.5 2.5 2.5 2.5 3 3 3 2.75 2.75 ...
#合併後共有1190 obs. of 12 variables(含"schoolid.x"、"schoolid.y")#原來dta_13的題目和dta_12是一樣的,都是 merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
#改一下dta_13<- merge(x=dta_chld, y=dta_cls, by="classid", "schoolid")
dta_13 <- merge(x=dta_chld, y=dta_cls, by="classid", "schoolid")
names(dta_13) [1] "schoolid" "childid" "classid" "sex" "minority"
[6] "mathkind" "mathgain" "ses" "schoolid.y" "yearstea"
[11] "mathknow" "mathprep"
#多了一個schoolid.y
str(dta_13)'data.frame': 1190 obs. of 12 variables:
$ schoolid : int 1 1 1 1 1 1 1 1 1 1 ...
$ childid : int 1 2 3 4 5 6 7 8 9 10 ...
$ classid : int 160 160 160 217 217 217 217 217 217 217 ...
$ sex : int 1 0 1 0 0 1 0 0 1 0 ...
$ minority : int 1 1 1 1 1 1 1 1 1 1 ...
$ mathkind : int 448 460 511 449 425 450 452 443 422 480 ...
$ mathgain : int 32 109 56 83 53 65 51 66 88 -7 ...
$ ses : num 0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
$ schoolid.y: int 61 61 61 61 61 61 61 61 61 61 ...
$ yearstea : num 2 2 2 2 2 2 2 2 2 2 ...
$ mathknow : num -0.72 -0.72 -0.72 -0.72 -0.72 -0.72 -0.72 -0.72 -0.72 -0.72 ...
$ mathprep : num 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
#1190 obs. of 12 variablesmerge(x=dta_chld, y=dta_cls, by=c(“classid”, “schoolid”))
merge(x=dta_chld, y=dta_cls, by=“classid”, “schoolid”)
by=c(“name”)會以column處理,相同的column只會呈現x=data的資料
by=“name”,y=data若遇到x=data相同的coulmn,也會一併呈現
dta_23 <- merge(x=dta_cls, y=dta_schl, by="schoolid")
#將x=dta_cls(5 variances), y=dta_chld(8 variances)兩個 data.frame 透過欄位by="schoolid"進行合併
#x=dta_cls資料中對應y=dta_schl有"schoolid"併過去
names(dta_23)[1] "schoolid" "classid" "yearstea" "mathknow" "mathprep" "housepov"
#以x=dta_cls資料為主軸,共有6個variables(資料原來的那6個)
str(dta_23)'data.frame': 312 obs. of 6 variables:
$ schoolid: int 1 1 2 2 2 3 3 3 3 4 ...
$ classid : int 160 217 197 211 307 11 137 145 228 48 ...
$ yearstea: num 1 2 1 2 12.5 ...
$ mathknow: num NA -0.11 -1.25 -0.72 NA 0.45 0.99 1.61 1.14 -1.05 ...
$ mathprep: num 2 3.25 2.5 2.33 2.3 3.83 2.25 3 2.17 2 ...
$ housepov: num 0.082 0.082 0.082 0.082 0.082 0.086 0.086 0.086 0.086 0.365 ...
#剩下觀察值312 obs. of 6 variables,和原來dta_cls資料是一樣的
summary(dta_23) schoolid classid yearstea mathknow
Min. : 1.00 Min. : 1.00 Min. : 0.00 Min. :-2.50000
1st Qu.: 24.75 1st Qu.: 78.75 1st Qu.: 4.00 1st Qu.:-0.76000
Median : 53.00 Median :156.50 Median :10.00 Median :-0.19000
Mean : 52.62 Mean :156.50 Mean :12.28 Mean :-0.08025
3rd Qu.: 79.00 3rd Qu.:234.25 3rd Qu.:20.00 3rd Qu.: 0.62000
Max. :107.00 Max. :312.00 Max. :40.00 Max. : 2.61000
NA's :27
mathprep housepov
Min. :1.000 Min. :0.0120
1st Qu.:2.000 1st Qu.:0.0850
Median :2.300 Median :0.1420
Mean :2.577 Mean :0.1908
3rd Qu.:3.000 3rd Qu.:0.2630
Max. :6.000 Max. :0.5640
summary(dta_cls) schoolid classid yearstea mathknow
Min. : 1.00 Min. : 1.00 Min. : 0.00 Min. :-2.50000
1st Qu.: 24.75 1st Qu.: 78.75 1st Qu.: 4.00 1st Qu.:-0.76000
Median : 53.00 Median :156.50 Median :10.00 Median :-0.19000
Mean : 52.62 Mean :156.50 Mean :12.28 Mean :-0.08025
3rd Qu.: 79.00 3rd Qu.:234.25 3rd Qu.:20.00 3rd Qu.: 0.62000
Max. :107.00 Max. :312.00 Max. :40.00 Max. : 2.61000
NA's :27
mathprep
Min. :1.000
1st Qu.:2.000
Median :2.300
Mean :2.577
3rd Qu.:3.000
Max. :6.000
#確認(dta_23)和(dta_cls)一模一樣
dta_12s <- merge(x=dta_chld, y=dta_cls, by="schoolid")
#將x=dta_chld(8 variances), y=dta_cls(5 variances)兩個 data.frame 透過欄位by="schoolid"進行合併
#x=dta_chld其實包含了y=dta_cls所有資料
names(dta_12s) [1] "schoolid" "childid" "classid.x" "sex" "minority" "mathkind"
[7] "mathgain" "ses" "classid.y" "yearstea" "mathknow" "mathprep"
#column變成12個,因為保留了X=dta_chld的"classid.x"和y=dta_cls的"classid.y"
str(dta_12s)'data.frame': 4138 obs. of 12 variables:
$ schoolid : int 1 1 1 1 1 1 1 1 1 1 ...
$ childid : int 1 1 2 2 3 3 4 4 5 5 ...
$ classid.x: int 160 160 160 160 160 160 217 217 217 217 ...
$ sex : int 1 1 0 0 1 1 0 0 0 0 ...
$ minority : int 1 1 1 1 1 1 1 1 1 1 ...
$ mathkind : int 448 448 460 460 511 511 449 449 425 425 ...
$ mathgain : int 32 32 109 109 56 56 83 83 53 53 ...
$ ses : num 0.46 0.46 -0.27 -0.27 -0.03 -0.03 -0.38 -0.38 -0.03 -0.03 ...
$ classid.y: int 160 217 160 217 160 217 160 217 160 217 ...
$ yearstea : num 1 2 1 2 1 2 1 2 1 2 ...
$ mathknow : num NA -0.11 NA -0.11 NA -0.11 NA -0.11 NA -0.11 ...
$ mathprep : num 2 3.25 2 3.25 2 3.25 2 3.25 2 3.25 ...
#不知道為何變成4138 obs???dta_12s <- merge(x=dta_chld, y=dta_cls, by=“schoolid”) 不知道為何str(dta_12s)變成4138 obs???
dta_123 <- merge(x=dta_12, y=dta_schl, by=c("schoolid"))
names(dta_12) [1] "classid" "schoolid" "childid" "sex" "minority" "mathkind"
[7] "mathgain" "ses" "yearstea" "mathknow" "mathprep"
#x=dta_12 1190 obs. of 11 variable
#"schoolid" "classid" "childid" "sex" "minority" "mathkind" "mathgain" "ses" "yearstea" "mathknow" "mathprep" "housepov"
names(dta_schl)[1] "schoolid" "housepov"
#y=dta_schl 107 obs. of 2 variable
#"schoolid" "housepov"
names(dta_123) [1] "schoolid" "classid" "childid" "sex" "minority" "mathkind"
[7] "mathgain" "ses" "yearstea" "mathknow" "mathprep" "housepov"
#以x=dta_12,依據c("schoolid")把y=dta_schl的資料帶入
#c("housepov")是重複的column,會自動融合只顯示一個
str(dta_123)'data.frame': 1190 obs. of 12 variables:
$ schoolid: int 1 1 1 1 1 1 1 1 1 1 ...
$ classid : int 217 217 160 217 217 160 217 160 217 217 ...
$ childid : int 6 9 1 8 11 2 10 3 7 4 ...
$ sex : int 1 1 1 0 0 0 0 1 0 0 ...
$ minority: int 1 1 1 1 1 1 1 1 1 1 ...
$ mathkind: int 450 422 448 443 502 460 480 511 452 449 ...
$ mathgain: int 65 88 32 66 60 109 -7 56 51 83 ...
$ ses : num 0.76 0.64 0.46 0.2 0.83 -0.27 0.13 -0.03 -0.03 -0.38 ...
$ yearstea: num 2 2 1 2 2 1 2 1 2 2 ...
$ mathknow: num -0.11 -0.11 NA -0.11 -0.11 NA -0.11 NA -0.11 -0.11 ...
$ mathprep: num 3.25 3.25 2 3.25 3.25 2 3.25 2 3.25 3.25 ...
$ housepov: num 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
#'data.frame': 1190 obs. of 12 variablessapply(list(dta_12, dta_13, dta_23, dta_123), dim) [,1] [,2] [,3] [,4]
[1,] 1190 1190 312 1190
[2,] 11 12 6 12
#apply dta_12, dta_13, dta_23, dta_123的obs.和dim(column數)
#作成2x4的matrix