Data

pacman::p_load(WWGbook)
data(classroom, package="WWGbook")
?classroom  #help classroom
str(classroom)   #data structure of classroom
'data.frame':   1190 obs. of  12 variables:
 $ sex     : int  1 0 1 0 0 1 0 0 1 0 ...
 $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind: int  448 460 511 449 425 450 452 443 422 480 ...
 $ mathgain: int  32 109 56 83 53 65 51 66 88 -7 ...
 $ ses     : num  0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
 $ yearstea: num  1 1 1 2 2 2 2 2 2 2 ...
 $ mathknow: num  NA NA NA -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 -0.11 ...
 $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
 $ mathprep: num  2 2 2 3.25 3.25 3.25 3.25 3.25 3.25 3.25 ...
 $ classid : int  160 160 160 217 217 217 217 217 217 217 ...
 $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
 $ childid : int  1 2 3 4 5 6 7 8 9 10 ...
names(classroom)
 [1] "sex"      "minority" "mathkind" "mathgain" "ses"      "yearstea"
 [7] "mathknow" "housepov" "mathprep" "classid"  "schoolid" "childid" 

Split data

dta_sch<-table(classroom$schoolid)
#classroom$schoolid的值與次數作成table,存成dta_sch
#dta_sch|>as.data.frame()  表格太長先省略
#以data.frame的樣貌呈現dta_sch表格,發現有107個數值
dta_schl <- classroom[duplicated(classroom$schoolid)==FALSE, 
                     c("schoolid", "housepov")]

#schoolid除了第一次出現的數值外,刪除重複的值
#將檔案存成dta_schl,僅保留 c("schoolid", "housepov")

str(dta_schl)
'data.frame':   107 obs. of  2 variables:
 $ schoolid: int  1 2 3 4 5 6 7 8 9 10 ...
 $ housepov: num  0.082 0.082 0.086 0.365 0.511 0.044 0.148 0.085 0.537 0.346 ...
#共有107 obs. of  2 variables,與dta_sch一致
head(dta_schl)
   schoolid housepov
1         1    0.082
12        2    0.082
22        3    0.086
36        4    0.365
42        5    0.511
48        6    0.044
dta_sch2 <- classroom[duplicated(classroom$schoolid)==TRUE, 
                     c("schoolid", "housepov")]
#[duplicated(classroom$schoolid)==TRUE,刪除重複的第一筆資料,保留其他的資料。
str(dta_sch2)
'data.frame':   1083 obs. of  2 variables:
 $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
 $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
#共有1083 obs. of  2 variables
#str(classroom)有1190 obs.- str(dta_schl)有 170 obs.=1083 obs.
dta_cl<-table(classroom$classid)
#dta_cl|>as.data.frame() 表格太長先省略
#classid的值與分布次數統計,共有312個值(1~312)
dta_cls <- classroom[duplicated(classroom$classid)==FALSE, 
                     c(11, 10, 6,7,9)]
#classid除了第一次出現的數值外,刪除重複的值
#將檔案存成dta_cls,僅保留c(11, 10, 6, 7, 9) c("schoolid", "classid", "yearstea", "mathknow", "mathprep")
knitr::include_graphics("names(classroom).png") 

#原來的12個variables
str(dta_cls)
'data.frame':   312 obs. of  5 variables:
 $ schoolid: int  1 1 2 2 2 3 3 3 3 4 ...
 $ classid : int  160 217 197 211 307 11 137 145 228 48 ...
 $ yearstea: num  1 2 1 2 12.5 ...
 $ mathknow: num  NA -0.11 -1.25 -0.72 NA 0.45 0.99 1.61 1.14 -1.05 ...
 $ mathprep: num  2 3.25 2.5 2.33 2.3 3.83 2.25 3 2.17 2 ...
#共有312 obs. of  5 variables,與table(classroom$classid)結果一致
head(dta_cls)
   schoolid classid yearstea mathknow mathprep
1         1     160     1.00       NA     2.00
4         1     217     2.00    -0.11     3.25
12        2     197     1.00    -1.25     2.50
14        2     211     2.00    -0.72     2.33
18        2     307    12.54       NA     2.30
22        3      11    20.00     0.45     3.83
#列出前面六筆資料
knitr::include_graphics("names(classroom).png")

#原來的12個variables
dta_chld <- classroom[, c(12, 10, 11, 1:5)]
#保留classroom 第12、10、11、1~5共8個column,存成dta_chld
str(dta_chld)
'data.frame':   1190 obs. of  8 variables:
 $ childid : int  1 2 3 4 5 6 7 8 9 10 ...
 $ classid : int  160 160 160 217 217 217 217 217 217 217 ...
 $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
 $ sex     : int  1 0 1 0 0 1 0 0 1 0 ...
 $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind: int  448 460 511 449 425 450 452 443 422 480 ...
 $ mathgain: int  32 109 56 83 53 65 51 66 88 -7 ...
 $ ses     : num  0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
#data.frame':   1190 obs. of  8 variables
dta_chld|>as.data.frame()|>head(3)
  childid classid schoolid sex minority mathkind mathgain   ses
1       1     160        1   1        1      448       32  0.46
2       2     160        1   0        1      460      109 -0.27
3       3     160        1   1        1      511       56 -0.03
#用|>as.data.frame檢查一下classroom[, c(12, 10, 11, 1:5)]原來資料的順序一致,列舉前3筆資料
#Apply a Function over a List or Vector
dta_sap<-sapply(list(dta_schl, dta_cls, dta_chld), dim)
#運用sapply作成一個list
class(dta_sap)
[1] "matrix" "array" 
#是2x3 matrix也是array
str(dta_sap)
 int [1:2, 1:3] 107 2 312 5 1190 8
#資料皆為integer(整數)
dta_sap
     [,1] [,2] [,3]
[1,]  107  312 1190
[2,]    2    5    8
#把資料列出來

dta_sapp<-sapply(list(dta_schl$schoolid, dta_cls$schoolid, dta_chld$schoolid), mean)
#也可以用sapply列出資料的統計值,以mean為例
class(dta_sapp)
[1] "numeric"
#numeric
str(dta_sapp)
 num [1:3] 54 52.6 52.9
#資料皆為numeric(數值,有小數)
dta_sapp
[1] 54.00000 52.61859 52.94034
#把資料列出來,確實有小數

Combine data

dta_12 <- merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
#將x=dta_chld(1190 obs. of 8 variables), y=dta_cls(312 obs. of 5 variables)兩個 data.frame 透過欄位by=c("classid", "schoolid")一樣的進行合併
#dta_cls有【"classid"+"schoolid"】2個以外,還有另外6個【"childid"+"sex"+"minority"+"mathkind"+"mathgain"+"ses"】     
#dta_cls有【"classid"+"schoolid"】2個以外,還有另外3個【"yearstea"+"mathknow"+"mathprep"】
names(dta_12)
 [1] "classid"  "schoolid" "childid"  "sex"      "minority" "mathkind"
 [7] "mathgain" "ses"      "yearstea" "mathknow" "mathprep"
#合併後資料共有11個column
#合併後column names按照合併順序+"x=dta_chld"+"y=dta_cls"
str(dta_12)
'data.frame':   1190 obs. of  11 variables:
 $ classid : int  1 1 1 1 1 10 10 100 101 101 ...
 $ schoolid: int  61 61 61 61 61 17 17 33 20 20 ...
 $ childid : int  653 654 655 656 657 216 217 396 249 250 ...
 $ sex     : int  0 0 1 1 0 1 0 1 0 1 ...
 $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind: int  442 422 482 489 460 381 419 443 484 417 ...
 $ mathgain: int  35 109 66 19 10 143 69 39 68 79 ...
 $ ses     : num  -0.09 -1.32 0.43 -0.06 -1.28 -0.49 -1.35 -1.32 -0.98 -1.16 ...
 $ yearstea: num  2 2 2 2 2 5 5 19 6 6 ...
 $ mathknow: num  -0.72 -0.72 -0.72 -0.72 -0.72 -1.26 -1.26 0.12 -0.84 -0.84 ...
 $ mathprep: num  2.5 2.5 2.5 2.5 2.5 3.25 3.25 4 2.25 2.25 ...
#兩個data.frame合併by=c("classid", "schoolid"),有就保留
#合併後共有1190 obs. of  11 variables
dta_12c <- merge(x=dta_chld, y=dta_cls, by="classid")
#將x=dta_chld, y=dta_cls兩個 data.frame 透過欄位by="classid"進行合併
names(dta_12c)
 [1] "classid"    "childid"    "schoolid.x" "sex"        "minority"  
 [6] "mathkind"   "mathgain"   "ses"        "schoolid.y" "yearstea"  
[11] "mathknow"   "mathprep"  
#column變成12個,因為保留了X=dta_chld的"schoolid.x"和y=dta_cls的"schoolid.y"
str(dta_12c)
'data.frame':   1190 obs. of  12 variables:
 $ classid   : int  1 1 1 1 1 2 2 2 3 3 ...
 $ childid   : int  653 654 655 656 657 402 403 404 613 614 ...
 $ schoolid.x: int  61 61 61 61 61 34 34 34 56 56 ...
 $ sex       : int  0 0 1 1 0 0 0 1 1 1 ...
 $ minority  : int  1 1 1 1 1 1 0 1 0 0 ...
 $ mathkind  : int  442 422 482 489 460 453 472 437 506 516 ...
 $ mathgain  : int  35 109 66 19 10 60 52 85 -4 18 ...
 $ ses       : num  -0.09 -1.32 0.43 -0.06 -1.28 -0.41 -1.61 0.55 -0.43 -0.26 ...
 $ schoolid.y: int  61 61 61 61 61 34 34 34 56 56 ...
 $ yearstea  : num  2 2 2 2 2 2 2 2 13 13 ...
 $ mathknow  : num  -0.72 -0.72 -0.72 -0.72 -0.72 0.58 0.58 0.58 0.85 0.85 ...
 $ mathprep  : num  2.5 2.5 2.5 2.5 2.5 3 3 3 2.75 2.75 ...
#合併後共有1190 obs. of  12 variables(含"schoolid.x"、"schoolid.y")
#原來dta_13的題目和dta_12是一樣的,都是 merge(x=dta_chld, y=dta_cls, by=c("classid", "schoolid"))
#改一下dta_13<- merge(x=dta_chld, y=dta_cls, by="classid", "schoolid")
dta_13 <- merge(x=dta_chld, y=dta_cls, by="classid", "schoolid")
names(dta_13)
 [1] "schoolid"   "childid"    "classid"    "sex"        "minority"  
 [6] "mathkind"   "mathgain"   "ses"        "schoolid.y" "yearstea"  
[11] "mathknow"   "mathprep"  
#多了一個schoolid.y
str(dta_13)
'data.frame':   1190 obs. of  12 variables:
 $ schoolid  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ childid   : int  1 2 3 4 5 6 7 8 9 10 ...
 $ classid   : int  160 160 160 217 217 217 217 217 217 217 ...
 $ sex       : int  1 0 1 0 0 1 0 0 1 0 ...
 $ minority  : int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind  : int  448 460 511 449 425 450 452 443 422 480 ...
 $ mathgain  : int  32 109 56 83 53 65 51 66 88 -7 ...
 $ ses       : num  0.46 -0.27 -0.03 -0.38 -0.03 0.76 -0.03 0.2 0.64 0.13 ...
 $ schoolid.y: int  61 61 61 61 61 61 61 61 61 61 ...
 $ yearstea  : num  2 2 2 2 2 2 2 2 2 2 ...
 $ mathknow  : num  -0.72 -0.72 -0.72 -0.72 -0.72 -0.72 -0.72 -0.72 -0.72 -0.72 ...
 $ mathprep  : num  2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 2.5 ...
#1190 obs. of  12 variables

merge(x=dta_chld, y=dta_cls, by=c(“classid”, “schoolid”))

merge(x=dta_chld, y=dta_cls, by=“classid”, “schoolid”)

by=c(“name”)會以column處理,相同的column只會呈現x=data的資料

by=“name”,y=data若遇到x=data相同的coulmn,也會一併呈現

dta_23 <- merge(x=dta_cls, y=dta_schl, by="schoolid")
#將x=dta_cls(5 variances), y=dta_chld(8 variances)兩個 data.frame 透過欄位by="schoolid"進行合併
#x=dta_cls資料中對應y=dta_schl有"schoolid"併過去
names(dta_23)
[1] "schoolid" "classid"  "yearstea" "mathknow" "mathprep" "housepov"
#以x=dta_cls資料為主軸,共有6個variables(資料原來的那6個)
str(dta_23)
'data.frame':   312 obs. of  6 variables:
 $ schoolid: int  1 1 2 2 2 3 3 3 3 4 ...
 $ classid : int  160 217 197 211 307 11 137 145 228 48 ...
 $ yearstea: num  1 2 1 2 12.5 ...
 $ mathknow: num  NA -0.11 -1.25 -0.72 NA 0.45 0.99 1.61 1.14 -1.05 ...
 $ mathprep: num  2 3.25 2.5 2.33 2.3 3.83 2.25 3 2.17 2 ...
 $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.086 0.086 0.086 0.086 0.365 ...
#剩下觀察值312 obs. of  6 variables,和原來dta_cls資料是一樣的
summary(dta_23)
    schoolid         classid          yearstea        mathknow       
 Min.   :  1.00   Min.   :  1.00   Min.   : 0.00   Min.   :-2.50000  
 1st Qu.: 24.75   1st Qu.: 78.75   1st Qu.: 4.00   1st Qu.:-0.76000  
 Median : 53.00   Median :156.50   Median :10.00   Median :-0.19000  
 Mean   : 52.62   Mean   :156.50   Mean   :12.28   Mean   :-0.08025  
 3rd Qu.: 79.00   3rd Qu.:234.25   3rd Qu.:20.00   3rd Qu.: 0.62000  
 Max.   :107.00   Max.   :312.00   Max.   :40.00   Max.   : 2.61000  
                                                   NA's   :27        
    mathprep        housepov     
 Min.   :1.000   Min.   :0.0120  
 1st Qu.:2.000   1st Qu.:0.0850  
 Median :2.300   Median :0.1420  
 Mean   :2.577   Mean   :0.1908  
 3rd Qu.:3.000   3rd Qu.:0.2630  
 Max.   :6.000   Max.   :0.5640  
                                 
summary(dta_cls)
    schoolid         classid          yearstea        mathknow       
 Min.   :  1.00   Min.   :  1.00   Min.   : 0.00   Min.   :-2.50000  
 1st Qu.: 24.75   1st Qu.: 78.75   1st Qu.: 4.00   1st Qu.:-0.76000  
 Median : 53.00   Median :156.50   Median :10.00   Median :-0.19000  
 Mean   : 52.62   Mean   :156.50   Mean   :12.28   Mean   :-0.08025  
 3rd Qu.: 79.00   3rd Qu.:234.25   3rd Qu.:20.00   3rd Qu.: 0.62000  
 Max.   :107.00   Max.   :312.00   Max.   :40.00   Max.   : 2.61000  
                                                   NA's   :27        
    mathprep    
 Min.   :1.000  
 1st Qu.:2.000  
 Median :2.300  
 Mean   :2.577  
 3rd Qu.:3.000  
 Max.   :6.000  
                
#確認(dta_23)和(dta_cls)一模一樣


dta_12s <- merge(x=dta_chld, y=dta_cls, by="schoolid")
#將x=dta_chld(8 variances), y=dta_cls(5 variances)兩個 data.frame 透過欄位by="schoolid"進行合併
#x=dta_chld其實包含了y=dta_cls所有資料
names(dta_12s)
 [1] "schoolid"  "childid"   "classid.x" "sex"       "minority"  "mathkind" 
 [7] "mathgain"  "ses"       "classid.y" "yearstea"  "mathknow"  "mathprep" 
#column變成12個,因為保留了X=dta_chld的"classid.x"和y=dta_cls的"classid.y"
str(dta_12s)
'data.frame':   4138 obs. of  12 variables:
 $ schoolid : int  1 1 1 1 1 1 1 1 1 1 ...
 $ childid  : int  1 1 2 2 3 3 4 4 5 5 ...
 $ classid.x: int  160 160 160 160 160 160 217 217 217 217 ...
 $ sex      : int  1 1 0 0 1 1 0 0 0 0 ...
 $ minority : int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind : int  448 448 460 460 511 511 449 449 425 425 ...
 $ mathgain : int  32 32 109 109 56 56 83 83 53 53 ...
 $ ses      : num  0.46 0.46 -0.27 -0.27 -0.03 -0.03 -0.38 -0.38 -0.03 -0.03 ...
 $ classid.y: int  160 217 160 217 160 217 160 217 160 217 ...
 $ yearstea : num  1 2 1 2 1 2 1 2 1 2 ...
 $ mathknow : num  NA -0.11 NA -0.11 NA -0.11 NA -0.11 NA -0.11 ...
 $ mathprep : num  2 3.25 2 3.25 2 3.25 2 3.25 2 3.25 ...
#不知道為何變成4138 obs???

dta_12s <- merge(x=dta_chld, y=dta_cls, by=“schoolid”) 不知道為何str(dta_12s)變成4138 obs???

dta_123 <- merge(x=dta_12, y=dta_schl, by=c("schoolid"))
names(dta_12)
 [1] "classid"  "schoolid" "childid"  "sex"      "minority" "mathkind"
 [7] "mathgain" "ses"      "yearstea" "mathknow" "mathprep"
#x=dta_12 1190 obs. of 11 variable
#"schoolid" "classid"  "childid"  "sex"  "minority" "mathkind" "mathgain" "ses"  "yearstea" "mathknow" "mathprep" "housepov"
names(dta_schl)
[1] "schoolid" "housepov"
#y=dta_schl 107 obs. of 2 variable
#"schoolid" "housepov"

names(dta_123)
 [1] "schoolid" "classid"  "childid"  "sex"      "minority" "mathkind"
 [7] "mathgain" "ses"      "yearstea" "mathknow" "mathprep" "housepov"
#以x=dta_12,依據c("schoolid")把y=dta_schl的資料帶入
#c("housepov")是重複的column,會自動融合只顯示一個
str(dta_123)
'data.frame':   1190 obs. of  12 variables:
 $ schoolid: int  1 1 1 1 1 1 1 1 1 1 ...
 $ classid : int  217 217 160 217 217 160 217 160 217 217 ...
 $ childid : int  6 9 1 8 11 2 10 3 7 4 ...
 $ sex     : int  1 1 1 0 0 0 0 1 0 0 ...
 $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
 $ mathkind: int  450 422 448 443 502 460 480 511 452 449 ...
 $ mathgain: int  65 88 32 66 60 109 -7 56 51 83 ...
 $ ses     : num  0.76 0.64 0.46 0.2 0.83 -0.27 0.13 -0.03 -0.03 -0.38 ...
 $ yearstea: num  2 2 1 2 2 1 2 1 2 2 ...
 $ mathknow: num  -0.11 -0.11 NA -0.11 -0.11 NA -0.11 NA -0.11 -0.11 ...
 $ mathprep: num  3.25 3.25 2 3.25 3.25 2 3.25 2 3.25 3.25 ...
 $ housepov: num  0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 0.082 ...
#'data.frame':  1190 obs. of  12 variables
sapply(list(dta_12, dta_13, dta_23, dta_123), dim)
     [,1] [,2] [,3] [,4]
[1,] 1190 1190  312 1190
[2,]   11   12    6   12
#apply dta_12, dta_13, dta_23, dta_123的obs.和dim(column數)
#作成2x4的matrix